* [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 @ 2022-03-25 18:36 Noah Goldstein 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (6 more replies) 0 siblings, 7 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer SSSE3. As a result it is no longer worth the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - 5 files changed, 2006 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 6507d1b7fa..51222dfab1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -12,7 +12,6 @@ sysdep_routines += \ memcmp-evex-movbe \ memcmp-sse2 \ memcmp-sse4 \ - memcmp-ssse3 \ memcmpeq-avx2 \ memcmpeq-avx2-rtm \ memcmpeq-evex \ @@ -179,7 +178,6 @@ sysdep_routines += \ wmemcmp-c \ wmemcmp-evex-movbe \ wmemcmp-sse4 \ - wmemcmp-ssse3 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 40cc6cc49e..f389928a4e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), - __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) #ifdef SHARED @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), - __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) /* Support sysdeps/x86_64/multiarch/wmemset.c. */ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index cd12613699..44759a3ad5 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -20,7 +20,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S deleted file mode 100644 index df1b1fc494..0000000000 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ /dev/null @@ -1,1992 +0,0 @@ -/* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. 
- - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -# endif - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. -*/ - - atom_text_section -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP - test %RDX_LP, %RDX_LP - jz L(equal) -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -# endif - mov %rdx, %rcx - mov %rdi, %rdx - cmp $48, %rcx; - jae L(48bytesormore) /* LEN => 48 */ - - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -/* ECX >= 32. */ -L(48bytesormore): - movdqu (%rdi), %xmm3 - movdqu (%rsi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%rdi), %rdi - lea 16(%rsi), %rsi - sub $0xffff, %edx - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %rdx, %rdi - sub %rdx, %rsi - add %rdx, %rcx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %rdx, %rsi - -# ifndef USE_AS_WMEMCMP - cmp $8, %edx - jae L(next_unaligned_table) - cmp $0, %edx - je L(shr_0) - cmp $1, %edx - je L(shr_1) - cmp $2, %edx - je L(shr_2) - cmp $3, %edx - je L(shr_3) - cmp $4, %edx - je L(shr_4) - cmp $5, %edx - je L(shr_5) - cmp $6, %edx - je L(shr_6) - jmp L(shr_7) - - .p2align 2 -L(next_unaligned_table): - cmp $8, %edx - je L(shr_8) - cmp $9, %edx - je L(shr_9) - cmp $10, %edx - je L(shr_10) - cmp $11, %edx - je L(shr_11) - cmp $12, %edx - je L(shr_12) - cmp $13, %edx - je L(shr_13) - cmp $14, %edx - je L(shr_14) - jmp L(shr_15) -# else - cmp $0, %edx - je L(shr_0) - cmp $4, %edx - je L(shr_4) - cmp $8, %edx - je L(shr_8) - jmp L(shr_12) -# endif - - .p2align 4 -L(shr_0): - cmp $80, %rcx - lea -48(%rcx), %rcx - jae L(shr_0_gobble) - xor %eax, %eax - movdqa (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_0_gobble): - movdqa (%rsi), %xmm0 - xor %eax, %eax - pcmpeqb (%rdi), %xmm0 - sub $32, %rcx - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 -L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %rcx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%rsi), %xmm0 - movdqa 48(%rsi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%rdi), %xmm0 - pcmpeqb 48(%rdi), %xmm2 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %rcx - jge L(next) - inc %edx - add $32, %rcx -L(next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_1): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - 
jae L(shr_1_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $1, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $1, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $1, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_1_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $1, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $1, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_1_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $1, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $1, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_1_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_1_gobble_next) - inc %edx - add $32, %rcx -L(shr_1_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 1(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - - .p2align 4 -L(shr_2): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $2, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $2, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_2_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $2, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $2, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $2, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $2, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_2_gobble_next) - inc %edx - add $32, %rcx -L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 2(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_3_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $3, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $3, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $3, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $3, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $3, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_3_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $3, 48(%rsi), %xmm3 - sbb 
$0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $3, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_3_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_3_gobble_next) - inc %edx - add $32, %rcx -L(shr_3_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 3(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_4): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $4, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $4, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_4_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $4, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $4, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $4, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $4, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_4_gobble_next) - inc %edx - add $32, %rcx -L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 4(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_5): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_5_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $5, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $5, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $5, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_5_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $5, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $5, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_5_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $5, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $5, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_5_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_5_gobble_next) - inc %edx - add $32, %rcx -L(shr_5_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 5(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6, (%rsi), %xmm1 - pcmpeqb (%rdi), 
%xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $6, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $6, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $6, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $6, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $6, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $6, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_6_gobble_next) - inc %edx - add $32, %rcx -L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 6(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_7_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $7, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $7, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $7, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $7, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $7, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_7_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $7, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $7, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_7_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_7_gobble_next) - inc %edx - add $32, %rcx -L(shr_7_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 7(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_8): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $8, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $8, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_8_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $8, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $8, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $8, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $8, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 
32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_8_gobble_next) - inc %edx - add $32, %rcx -L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 8(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_9): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_9_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $9, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $9, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $9, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_9_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $9, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $9, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_9_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $9, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $9, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_9_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_9_gobble_next) - inc %edx - add $32, %rcx -L(shr_9_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 9(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $10, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $10, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $10, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $10, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $10, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $10, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_10_gobble_next) - inc %edx - add $32, %rcx -L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 10(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_11_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $11, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $11, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand 
%xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $11, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $11, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $11, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_11_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $11, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $11, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_11_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_11_gobble_next) - inc %edx - add $32, %rcx -L(shr_11_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 11(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_12): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $12, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $12, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_12_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $12, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $12, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $12, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $12, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_12_gobble_next) - inc %edx - add $32, %rcx -L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 12(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_13): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_13_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $13, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $13, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $13, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_13_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $13, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $13, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_13_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $13, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $13, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), 
%xmm3 - - lea 32(%rdi), %rdi - jz L(shr_13_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_13_gobble_next) - inc %edx - add $32, %rcx -L(shr_13_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 13(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $14, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $14, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $14, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $14, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $14, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_14_gobble_next) - inc %edx - add $32, %rcx -L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 14(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_15_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $15, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $15, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $15, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $15, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $15, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_15_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $15, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $15, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_15_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_15_gobble_next) - inc %edx - add $32, %rcx -L(shr_15_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 15(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) -# endif - .p2align 4 -L(exit): - pmovmskb %xmm1, %r8d - sub $0xffff, %r8d - jz L(first16bytes) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - mov %r8d, %edx -L(first16bytes): - add %rax, %rsi -L(less16bytes): -# ifndef USE_AS_WMEMCMP - test %dl, %dl - jz L(next_24_bytes) - - test $0x01, %dl - jnz L(Byte16) - - test $0x02, %dl - jnz 
L(Byte17) - - test $0x04, %dl - jnz L(Byte18) - - test $0x08, %dl - jnz L(Byte19) - - test $0x10, %dl - jnz L(Byte20) - - test $0x20, %dl - jnz L(Byte21) - - test $0x40, %dl - jnz L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte16): - movzbl -16(%rdi), %eax - movzbl -16(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte17): - movzbl -15(%rdi), %eax - movzbl -15(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte18): - movzbl -14(%rdi), %eax - movzbl -14(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte19): - movzbl -13(%rdi), %eax - movzbl -13(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte20): - movzbl -12(%rdi), %eax - movzbl -12(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte21): - movzbl -11(%rdi), %eax - movzbl -11(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte22): - movzbl -10(%rdi), %eax - movzbl -10(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(next_24_bytes): - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - test $0x01, %dh - jnz L(Byte16) - - test $0x02, %dh - jnz L(Byte17) - - test $0x04, %dh - jnz L(Byte18) - - test $0x08, %dh - jnz L(Byte19) - - test $0x10, %dh - jnz L(Byte20) - - test $0x20, %dh - jnz L(Byte21) - - test $0x40, %dh - jnz L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret -# else -/* special for wmemcmp */ - xor %eax, %eax - test %dl, %dl - jz L(next_two_double_words) - and $15, %dl - jz L(second_double_word) - mov -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(second_double_word): - mov -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(next_two_double_words): - and $15, %dh - jz L(fourth_double_word) - mov -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(fourth_double_word): - mov -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) - ret -# endif - - .p2align 4 -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $0, %ecx - je L(0bytes) -# ifndef USE_AS_WMEMCMP - cmp $1, %ecx - je L(1bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) -# else - jmp L(4bytes) -# endif - - .p2align 4 -L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) -# ifndef USE_AS_WMEMCMP - cmp $9, %ecx - je L(9bytes) - cmp $10, %ecx - je L(10bytes) - cmp $11, %ecx - je L(11bytes) - cmp $12, %ecx - je L(12bytes) - cmp $13, %ecx - je L(13bytes) - cmp $14, %ecx - je L(14bytes) - jmp L(15bytes) -# else - jmp L(12bytes) -# endif - - .p2align 4 -L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) -# ifndef USE_AS_WMEMCMP - cmp $17, %ecx - je L(17bytes) - cmp $18, %ecx - je L(18bytes) - cmp $19, %ecx - je L(19bytes) - cmp $20, %ecx - je L(20bytes) - cmp $21, %ecx - je L(21bytes) - cmp $22, %ecx - je L(22bytes) - jmp L(23bytes) -# else - jmp L(20bytes) -# endif - - .p2align 4 -L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) -# ifndef USE_AS_WMEMCMP - cmp $25, %ecx - je L(25bytes) - cmp $26, %ecx - je L(26bytes) - cmp $27, %ecx - je L(27bytes) - cmp $28, %ecx - je L(28bytes) - cmp $29, %ecx - je L(29bytes) - cmp $30, %ecx - je L(30bytes) - jmp L(31bytes) -# else - jmp L(28bytes) -# endif - - .p2align 4 -L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) -# ifndef USE_AS_WMEMCMP - cmp $33, %ecx - je L(33bytes) 
- cmp $34, %ecx - je L(34bytes) - cmp $35, %ecx - je L(35bytes) - cmp $36, %ecx - je L(36bytes) - cmp $37, %ecx - je L(37bytes) - cmp $38, %ecx - je L(38bytes) - jmp L(39bytes) -# else - jmp L(36bytes) -# endif - - .p2align 4 -L(more40bytes): - cmp $40, %ecx - je L(40bytes) -# ifndef USE_AS_WMEMCMP - cmp $41, %ecx - je L(41bytes) - cmp $42, %ecx - je L(42bytes) - cmp $43, %ecx - je L(43bytes) - cmp $44, %ecx - je L(44bytes) - cmp $45, %ecx - je L(45bytes) - cmp $46, %ecx - je L(46bytes) - jmp L(47bytes) - - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - movl -44(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - movl -40(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - movl -36(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - movl -32(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - movl -28(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - movl -24(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - movl -20(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - movl -16(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - movl -12(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - movl -8(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - movl -4(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# else - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - cmp -44(%rsi), %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - cmp -40(%rsi), %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - cmp -36(%rsi), %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - cmp -32(%rsi), %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - cmp -28(%rsi), %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - cmp -24(%rsi), %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - cmp -20(%rsi), %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(45bytes): - movl -45(%rdi), %eax - movl -45(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(41bytes): - movl -41(%rdi), %eax - movl -41(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(37bytes): - movl -37(%rdi), %eax - movl -37(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(33bytes): - movl -33(%rdi), %eax - movl -33(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(29bytes): - movl -29(%rdi), %eax - movl -29(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(25bytes): - movl -25(%rdi), %eax - movl -25(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(21bytes): - movl -21(%rdi), %eax - movl -21(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(17bytes): - movl -17(%rdi), %eax - movl -17(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(13bytes): - movl -13(%rdi), %eax - movl -13(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(9bytes): - movl -9(%rdi), %eax - movl -9(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(5bytes): - movl -5(%rdi), %eax - movl -5(%rsi), %ecx - cmp 
%ecx, %eax - jne L(find_diff) -L(1bytes): - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(46bytes): - movl -46(%rdi), %eax - movl -46(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(42bytes): - movl -42(%rdi), %eax - movl -42(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(38bytes): - movl -38(%rdi), %eax - movl -38(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(34bytes): - movl -34(%rdi), %eax - movl -34(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(30bytes): - movl -30(%rdi), %eax - movl -30(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(26bytes): - movl -26(%rdi), %eax - movl -26(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(22bytes): - movl -22(%rdi), %eax - movl -22(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(18bytes): - movl -18(%rdi), %eax - movl -18(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(14bytes): - movl -14(%rdi), %eax - movl -14(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(10bytes): - movl -10(%rdi), %eax - movl -10(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(6bytes): - movl -6(%rdi), %eax - movl -6(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(2bytes): - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - cmpb %cl, %al - jne L(set) - cmp %ecx, %eax - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(47bytes): - movl -47(%rdi), %eax - movl -47(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(43bytes): - movl -43(%rdi), %eax - movl -43(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(39bytes): - movl -39(%rdi), %eax - movl -39(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(35bytes): - movl -35(%rdi), %eax - movl -35(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(31bytes): - movl -31(%rdi), %eax - movl -31(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(27bytes): - movl -27(%rdi), %eax - movl -27(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(23bytes): - movl -23(%rdi), %eax - movl -23(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(19bytes): - movl -19(%rdi), %eax - movl -19(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(15bytes): - movl -15(%rdi), %eax - movl -15(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(11bytes): - movl -11(%rdi), %eax - movl -11(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(7bytes): - movl -7(%rdi), %eax - movl -7(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(3bytes): - movzwl -3(%rdi), %eax - movzwl -3(%rsi), %ecx - cmpb %cl, %al - jne L(set) - cmp %ecx, %eax - jne L(set) - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(find_diff): - cmpb %cl, %al - jne L(set) - cmpw %cx, %ax - jne L(set) - shr $16, %eax - shr $16, %ecx - cmpb %cl, %al - jne L(set) - -/* We get there only if we already know there is a -difference. */ - - cmp %ecx, %eax -L(set): - sbb %eax, %eax - sbb $-1, %eax - ret -# else - -/* for wmemcmp */ - .p2align 4 -L(find_diff): - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret -# endif - - .p2align 4 -L(equal): - xor %eax, %eax - ret - -END (MEMCMP) -#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S deleted file mode 100644 index a41ef95fc1..0000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_ssse3 - -#include "memcmp-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
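[Editorial context for the patch above: the bulk of the deleted memcmp-ssse3.S (nearly 2000 lines) is sixteen near-identical L(shr_0) ... L(shr_15) loops. SSSE3's palignr realigns the second source so both inputs can then be read with aligned loads, but palignr encodes its byte count as an immediate, so every possible misalignment needs its own copy of the compare loop. The sketch below is illustrative only, not the deleted glibc code: GNU C with SSE intrinsics, built with gcc -mssse3; the CMP16_SHR name and test values are invented for the example. It mirrors one movdqa/palignr/pcmpeqb/pmovmskb step of those loops.]

#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 == palignr */

/* Compare 16 bytes at s1 against the 16 bytes of s2 that start SHIFT
   bytes into an aligned 32-byte window.  SHIFT must be a compile-time
   constant because palignr takes an immediate -- which is why the
   deleted file carried one hand-written loop per shift value.
   Returns the pcmpeqb/pmovmskb mask: bit n clear means byte n differs.  */
#define CMP16_SHR(s1, s2_aligned, SHIFT)                                \
  ({                                                                    \
    __m128i lo = _mm_load_si128 ((const __m128i *) (s2_aligned));       \
    __m128i hi = _mm_load_si128 ((const __m128i *) (s2_aligned) + 1);   \
    __m128i v2 = _mm_alignr_epi8 (hi, lo, SHIFT); /* palignr $SHIFT */  \
    __m128i v1 = _mm_loadu_si128 ((const __m128i *) (s1));              \
    (unsigned) _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, v2));             \
  })

int
main (void)
{
  _Alignas (16) uint8_t src[32];
  uint8_t s1[16];
  for (int i = 0; i < 32; i++)
    src[i] = (uint8_t) i;
  memcpy (s1, src + 5, 16);     /* second source misaligned by 5 bytes */
  s1[9] ^= 0x80;                /* inject a difference at byte 9 */

  unsigned mask = CMP16_SHR (s1, src, 5);
  if (mask != 0xffff)
    printf ("first mismatch at byte %d\n", __builtin_ctz (~mask));
  return 0;
}

[Stamping that pattern out for all sixteen shifts is where the code size goes; the trimmed IFUNC_SELECTOR in the patch shows why it is now dead weight, since any CPU without SSE4.1 simply falls through to __memcmp_sse2.]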
* [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-03-25 18:36 ` Noah Goldstein 2022-03-25 19:55 ` H.J. Lu ` (9 more replies) 2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein ` (5 subsequent siblings) 6 siblings, 10 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer SSSE3. As a result it is no longer worth the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - sysdeps/x86_64/multiarch/strcmp.c | 4 - sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- sysdeps/x86_64/multiarch/strncmp.c | 4 - sysdeps/x86_64/strcmp.S | 155 ++++-------------- 10 files changed, 30 insertions(+), 202 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 51222dfab1..ed2def288d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -58,7 +58,6 @@ sysdep_routines += \ strcasecmp_l-evex \ strcasecmp_l-sse2 \ strcasecmp_l-sse4_2 \ - strcasecmp_l-ssse3 \ strcat-avx2 \ strcat-avx2-rtm \ strcat-evex \ @@ -80,7 +79,6 @@ sysdep_routines += \ strcmp-sse2 \ strcmp-sse2-unaligned \ strcmp-sse4_2 \ - strcmp-ssse3 \ strcpy-avx2 \ strcpy-avx2-rtm \ strcpy-evex \ @@ -98,7 +96,6 @@ sysdep_routines += \ strncase_l-evex \ strncase_l-sse2 \ strncase_l-sse4_2 \ - strncase_l-ssse3 \ strncat-avx2 \ strncat-avx2-rtm \ strncat-c \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncmp-evex \ strncmp-sse2 \ strncmp-sse4_2 \ - strncmp-ssse3 \ strncpy-avx2 \ strncpy-avx2-rtm \ strncpy-c \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f389928a4e..7e2be3554b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), __strcmp_sse42) - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), - __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), __strncmp_sse42) - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), - __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) #ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h index 766539c241..296d32071b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -20,7 +20,6 @@ #include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S deleted file mode 100644 index fb2f9ae14a..0000000000 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strcasecmp_l_ssse3 -#define __strcasecmp __strcasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S deleted file mode 100644 index 1b7fa33c91..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S +++ /dev/null @@ -1,5 +0,0 @@ -#if IS_IN (libc) -# define USE_SSSE3 1 -# define STRCMP __strcmp_ssse3 -# include "../strcmp.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 68cb73baad..a248c2a6e6 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ 
b/sysdeps/x86_64/multiarch/strcmp.c @@ -28,7 +28,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S deleted file mode 100644 index 6728678688..0000000000 --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRNCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strncasecmp_l_ssse3 -#define __strncasecmp __strncasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S deleted file mode 100644 index ec37308347..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S +++ /dev/null @@ -1,28 +0,0 @@ -/* strcmp optimized with SSSE3. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#define STRCMP __strncmp_ssse3 - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(strcmp) - -#define USE_SSSE3 1 -#define USE_AS_STRNCMP -#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index fca74199d8..70ae6547c9 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -27,7 +27,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 99d8b36f1d..c38dc627f9 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -59,12 +59,7 @@ # endif #endif -#ifndef USE_SSSE3 .text -#else - .section .text.ssse3,"ax",@progbits -#endif - #ifdef USE_AS_STRCASECMP_L # ifndef ENTRY2 # define ENTRY2(name) ENTRY (name) @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, 
%xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 
-#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + 
TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
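A note on the strcmp.S hunks above: the deleted palignr arm and the retained psrldq/pslldq/por sequence compute the same result, a 16-byte window taken out of the concatenation of two adjacent 16-byte blocks, so dropping the SSSE3 build loses no functionality. Below is a minimal sketch of the equivalence for the shift-by-4 case, written with SSE intrinsics rather than the patch's assembly; the function names are illustrative, not from glibc.

#include <emmintrin.h>	/* SSE2: psrldq, pslldq, por */
#include <tmmintrin.h>	/* SSSE3: palignr */

/* SSSE3 arm removed by the patch: a single instruction.
   palignr $4, lo, hi yields bytes 4..19 of the 32-byte value hi:lo.  */
static __m128i
merge_ssse3 (__m128i hi, __m128i lo)
{
  return _mm_alignr_epi8 (hi, lo, 4);
}

/* SSE2 arm kept by the patch: shift both halves, then OR.  */
static __m128i
merge_sse2 (__m128i hi, __m128i lo)
{
  lo = _mm_srli_si128 (lo, 4);	/* psrldq $4: drop the low 4 bytes.  */
  hi = _mm_slli_si128 (hi, 12);	/* pslldq $12: shift count is 16 - 4.  */
  return _mm_or_si128 (hi, lo);	/* por: same 16 bytes as palignr.  */
}

Every gobble_ashr_N hunk follows the same psrldq $N / pslldq $(16-N) pattern, which is why the whole #ifndef USE_SSSE3 conditional can collapse to its SSE2 arm once the SSSE3 objects are gone.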
* Re: [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-03-25 19:55 ` H.J. Lu 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (8 subsequent siblings) 9 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:55 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - > sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - > sysdeps/x86_64/multiarch/strcmp.c | 4 - > sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- > sysdeps/x86_64/multiarch/strncmp.c | 4 - > sysdeps/x86_64/strcmp.S | 155 ++++-------------- > 10 files changed, 30 insertions(+), 202 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 51222dfab1..ed2def288d 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -58,7 +58,6 @@ sysdep_routines += \ > strcasecmp_l-evex \ > strcasecmp_l-sse2 \ > strcasecmp_l-sse4_2 \ > - strcasecmp_l-ssse3 \ > strcat-avx2 \ > strcat-avx2-rtm \ > strcat-evex \ > @@ -80,7 +79,6 @@ sysdep_routines += \ > strcmp-sse2 \ > strcmp-sse2-unaligned \ > strcmp-sse4_2 \ > - strcmp-ssse3 \ > strcpy-avx2 \ > strcpy-avx2-rtm \ > strcpy-evex \ > @@ -98,7 +96,6 @@ sysdep_routines += \ > strncase_l-evex \ > strncase_l-sse2 \ > strncase_l-sse4_2 \ > - strncase_l-ssse3 \ > strncat-avx2 \ > strncat-avx2-rtm \ > strncat-c \ > @@ -110,7 +107,6 @@ sysdep_routines += \ > strncmp-evex \ > strncmp-sse2 \ > strncmp-sse4_2 \ > - strncmp-ssse3 \ > strncpy-avx2 \ > strncpy-avx2-rtm \ > strncpy-c \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index f389928a4e..7e2be3554b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ > @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, > __strcasecmp_l_sse2)) > > @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strcmp_evex) > IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), > __strcmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), > - __strcmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) > > @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp, 1, > __strncasecmp_sse2)) > > @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, > __strncasecmp_l_sse2)) > > @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strncmp_evex) > IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), > __strncmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), > - __strncmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) > > #ifdef SHARED > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > index 766539c241..296d32071b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > @@ -20,7 +20,6 @@ > #include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > deleted file mode 100644 > index fb2f9ae14a..0000000000 > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strcasecmp_l_ssse3 > -#define __strcasecmp __strcasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S > deleted file mode 100644 > index 1b7fa33c91..0000000000 > --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S > +++ /dev/null > @@ -1,5 +0,0 @@ > -#if IS_IN (libc) > -# define USE_SSSE3 1 > -# define STRCMP __strcmp_ssse3 > -# include "../strcmp.S" > -#endif > diff 
--git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c > index 68cb73baad..a248c2a6e6 100644 > --- a/sysdeps/x86_64/multiarch/strcmp.c > +++ b/sysdeps/x86_64/multiarch/strcmp.c > @@ -28,7 +28,6 @@ > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > return OPTIMIZE (sse2_unaligned); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > deleted file mode 100644 > index 6728678688..0000000000 > --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRNCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strncasecmp_l_ssse3 > -#define __strncasecmp __strncasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S > deleted file mode 100644 > index ec37308347..0000000000 > --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S > +++ /dev/null > @@ -1,28 +0,0 @@ > -/* strcmp optimized with SSSE3. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#define STRCMP __strncmp_ssse3 > - > -#undef libc_hidden_builtin_def > -#define libc_hidden_builtin_def(strcmp) > - > -#define USE_SSSE3 1 > -#define USE_AS_STRNCMP > -#include <sysdeps/x86_64/strcmp.S> > diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c > index fca74199d8..70ae6547c9 100644 > --- a/sysdeps/x86_64/multiarch/strncmp.c > +++ b/sysdeps/x86_64/multiarch/strncmp.c > @@ -27,7 +27,6 @@ > # include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S > index 99d8b36f1d..c38dc627f9 100644 > --- a/sysdeps/x86_64/strcmp.S > +++ b/sysdeps/x86_64/strcmp.S > @@ -59,12 +59,7 @@ > # endif > #endif > > -#ifndef USE_SSSE3 > .text > -#else > - .section .text.ssse3,"ax",@progbits > -#endif > - > #ifdef USE_AS_STRCASECMP_L > # ifndef ENTRY2 > # define ENTRY2(name) ENTRY (name) > @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq $15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq $15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > 
-#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ 
-1348,13 +1295,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1973,13 +1890,10 @@ 
LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
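The dispatch-side effect of this patch is that each IFUNC selector simply loses its SSSE3 step, so pre-SSE4.2 hardware now falls straight through to the SSE2 baseline. A simplified sketch of the post-patch selector shape (the real code is in ifunc-strcasecmp.h and additionally handles EVEX, AVX2-RTM and the Slow_SSE4_2 check; the has_* flags here are illustrative stand-ins for CPU_FEATURE_USABLE_P):

extern int __strcasecmp_avx2 (const char *, const char *);
extern int __strcasecmp_sse42 (const char *, const char *);
extern int __strcasecmp_sse2 (const char *, const char *);

typedef int (*cmp_fn) (const char *, const char *);

static cmp_fn
select_strcasecmp (int has_avx2, int has_sse42)
{
  if (has_avx2)
    return __strcasecmp_avx2;
  if (has_sse42)
    return __strcasecmp_sse42;
  /* The SSSE3 branch that used to sit between SSE4.2 and SSE2
     is gone.  */
  return __strcasecmp_sse2;
}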
* [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-03-25 19:55 ` H.J. Lu @ 2022-03-25 20:44 ` Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (4 more replies) 2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein ` (7 subsequent siblings) 9 siblings, 5 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - 5 files changed, 2006 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 6507d1b7fa..51222dfab1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -12,7 +12,6 @@ sysdep_routines += \ memcmp-evex-movbe \ memcmp-sse2 \ memcmp-sse4 \ - memcmp-ssse3 \ memcmpeq-avx2 \ memcmpeq-avx2-rtm \ memcmpeq-evex \ @@ -179,7 +178,6 @@ sysdep_routines += \ wmemcmp-c \ wmemcmp-evex-movbe \ wmemcmp-sse4 \ - wmemcmp-ssse3 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 40cc6cc49e..f389928a4e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), - __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) #ifdef SHARED @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), - __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index cd12613699..44759a3ad5 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -20,7 +20,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S deleted file mode 100644 index df1b1fc494..0000000000 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ /dev/null @@ -1,1992 +0,0 @@ -/* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -# endif - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. -*/ - - atom_text_section -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP - test %RDX_LP, %RDX_LP - jz L(equal) -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -# endif - mov %rdx, %rcx - mov %rdi, %rdx - cmp $48, %rcx; - jae L(48bytesormore) /* LEN => 48 */ - - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -/* ECX >= 32. 
*/ -L(48bytesormore): - movdqu (%rdi), %xmm3 - movdqu (%rsi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%rdi), %rdi - lea 16(%rsi), %rsi - sub $0xffff, %edx - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %rdx, %rdi - sub %rdx, %rsi - add %rdx, %rcx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %rdx, %rsi - -# ifndef USE_AS_WMEMCMP - cmp $8, %edx - jae L(next_unaligned_table) - cmp $0, %edx - je L(shr_0) - cmp $1, %edx - je L(shr_1) - cmp $2, %edx - je L(shr_2) - cmp $3, %edx - je L(shr_3) - cmp $4, %edx - je L(shr_4) - cmp $5, %edx - je L(shr_5) - cmp $6, %edx - je L(shr_6) - jmp L(shr_7) - - .p2align 2 -L(next_unaligned_table): - cmp $8, %edx - je L(shr_8) - cmp $9, %edx - je L(shr_9) - cmp $10, %edx - je L(shr_10) - cmp $11, %edx - je L(shr_11) - cmp $12, %edx - je L(shr_12) - cmp $13, %edx - je L(shr_13) - cmp $14, %edx - je L(shr_14) - jmp L(shr_15) -# else - cmp $0, %edx - je L(shr_0) - cmp $4, %edx - je L(shr_4) - cmp $8, %edx - je L(shr_8) - jmp L(shr_12) -# endif - - .p2align 4 -L(shr_0): - cmp $80, %rcx - lea -48(%rcx), %rcx - jae L(shr_0_gobble) - xor %eax, %eax - movdqa (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_0_gobble): - movdqa (%rsi), %xmm0 - xor %eax, %eax - pcmpeqb (%rdi), %xmm0 - sub $32, %rcx - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 -L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %rcx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%rsi), %xmm0 - movdqa 48(%rsi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%rdi), %xmm0 - pcmpeqb 48(%rdi), %xmm2 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %rcx - jge L(next) - inc %edx - add $32, %rcx -L(next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_1): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_1_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $1, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $1, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $1, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_1_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $1, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $1, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_1_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $1, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $1, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_1_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_1_gobble_next) - inc %edx - add $32, %rcx -L(shr_1_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 1(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi 
- jmp L(less48bytes) - - - .p2align 4 -L(shr_2): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $2, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $2, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_2_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $2, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $2, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $2, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $2, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_2_gobble_next) - inc %edx - add $32, %rcx -L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 2(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_3_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $3, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $3, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $3, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $3, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $3, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_3_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $3, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $3, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_3_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_3_gobble_next) - inc %edx - add $32, %rcx -L(shr_3_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 3(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_4): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $4, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $4, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_4_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $4, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $4, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, 
%rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $4, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $4, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_4_gobble_next) - inc %edx - add $32, %rcx -L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 4(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_5): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_5_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $5, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $5, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $5, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_5_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $5, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $5, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_5_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $5, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $5, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_5_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_5_gobble_next) - inc %edx - add $32, %rcx -L(shr_5_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 5(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $6, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $6, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $6, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $6, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $6, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $6, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_6_gobble_next) - inc %edx - add $32, %rcx -L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 6(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae 
L(shr_7_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $7, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $7, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $7, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $7, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $7, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_7_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $7, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $7, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_7_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_7_gobble_next) - inc %edx - add $32, %rcx -L(shr_7_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 7(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_8): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $8, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $8, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_8_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $8, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $8, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $8, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $8, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_8_gobble_next) - inc %edx - add $32, %rcx -L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 8(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_9): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_9_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $9, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $9, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $9, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_9_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $9, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $9, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_9_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - 
palignr $9, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $9, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_9_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_9_gobble_next) - inc %edx - add $32, %rcx -L(shr_9_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 9(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $10, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $10, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $10, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $10, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $10, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $10, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_10_gobble_next) - inc %edx - add $32, %rcx -L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 10(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_11_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $11, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $11, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $11, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $11, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $11, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_11_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $11, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $11, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_11_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_11_gobble_next) - inc %edx - add $32, %rcx -L(shr_11_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 11(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_12): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - 
palignr $12, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $12, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_12_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $12, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $12, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $12, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $12, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_12_gobble_next) - inc %edx - add $32, %rcx -L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 12(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_13): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_13_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $13, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $13, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $13, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_13_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $13, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $13, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_13_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $13, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $13, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_13_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_13_gobble_next) - inc %edx - add $32, %rcx -L(shr_13_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 13(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $14, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $14, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $14, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $14, 48(%rsi), %xmm3 - sbb $0xffff, 
%edx - movdqa 48(%rsi), %xmm0 - palignr $14, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_14_gobble_next) - inc %edx - add $32, %rcx -L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 14(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_15_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $15, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $15, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $15, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $15, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $15, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_15_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $15, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $15, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_15_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_15_gobble_next) - inc %edx - add $32, %rcx -L(shr_15_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 15(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) -# endif - .p2align 4 -L(exit): - pmovmskb %xmm1, %r8d - sub $0xffff, %r8d - jz L(first16bytes) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - mov %r8d, %edx -L(first16bytes): - add %rax, %rsi -L(less16bytes): -# ifndef USE_AS_WMEMCMP - test %dl, %dl - jz L(next_24_bytes) - - test $0x01, %dl - jnz L(Byte16) - - test $0x02, %dl - jnz L(Byte17) - - test $0x04, %dl - jnz L(Byte18) - - test $0x08, %dl - jnz L(Byte19) - - test $0x10, %dl - jnz L(Byte20) - - test $0x20, %dl - jnz L(Byte21) - - test $0x40, %dl - jnz L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte16): - movzbl -16(%rdi), %eax - movzbl -16(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte17): - movzbl -15(%rdi), %eax - movzbl -15(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte18): - movzbl -14(%rdi), %eax - movzbl -14(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte19): - movzbl -13(%rdi), %eax - movzbl -13(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte20): - movzbl -12(%rdi), %eax - movzbl -12(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte21): - movzbl -11(%rdi), %eax - movzbl -11(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte22): - movzbl -10(%rdi), %eax - movzbl -10(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(next_24_bytes): - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - test $0x01, %dh - jnz L(Byte16) - - test $0x02, %dh - jnz L(Byte17) - - test $0x04, %dh - jnz L(Byte18) - - test $0x08, %dh - jnz L(Byte19) - - test $0x10, %dh - jnz L(Byte20) - - test $0x20, %dh - jnz L(Byte21) - - test $0x40, %dh - jnz 
L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret -# else -/* special for wmemcmp */ - xor %eax, %eax - test %dl, %dl - jz L(next_two_double_words) - and $15, %dl - jz L(second_double_word) - mov -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(second_double_word): - mov -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(next_two_double_words): - and $15, %dh - jz L(fourth_double_word) - mov -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(fourth_double_word): - mov -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) - ret -# endif - - .p2align 4 -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $0, %ecx - je L(0bytes) -# ifndef USE_AS_WMEMCMP - cmp $1, %ecx - je L(1bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) -# else - jmp L(4bytes) -# endif - - .p2align 4 -L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) -# ifndef USE_AS_WMEMCMP - cmp $9, %ecx - je L(9bytes) - cmp $10, %ecx - je L(10bytes) - cmp $11, %ecx - je L(11bytes) - cmp $12, %ecx - je L(12bytes) - cmp $13, %ecx - je L(13bytes) - cmp $14, %ecx - je L(14bytes) - jmp L(15bytes) -# else - jmp L(12bytes) -# endif - - .p2align 4 -L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) -# ifndef USE_AS_WMEMCMP - cmp $17, %ecx - je L(17bytes) - cmp $18, %ecx - je L(18bytes) - cmp $19, %ecx - je L(19bytes) - cmp $20, %ecx - je L(20bytes) - cmp $21, %ecx - je L(21bytes) - cmp $22, %ecx - je L(22bytes) - jmp L(23bytes) -# else - jmp L(20bytes) -# endif - - .p2align 4 -L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) -# ifndef USE_AS_WMEMCMP - cmp $25, %ecx - je L(25bytes) - cmp $26, %ecx - je L(26bytes) - cmp $27, %ecx - je L(27bytes) - cmp $28, %ecx - je L(28bytes) - cmp $29, %ecx - je L(29bytes) - cmp $30, %ecx - je L(30bytes) - jmp L(31bytes) -# else - jmp L(28bytes) -# endif - - .p2align 4 -L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) -# ifndef USE_AS_WMEMCMP - cmp $33, %ecx - je L(33bytes) - cmp $34, %ecx - je L(34bytes) - cmp $35, %ecx - je L(35bytes) - cmp $36, %ecx - je L(36bytes) - cmp $37, %ecx - je L(37bytes) - cmp $38, %ecx - je L(38bytes) - jmp L(39bytes) -# else - jmp L(36bytes) -# endif - - .p2align 4 -L(more40bytes): - cmp $40, %ecx - je L(40bytes) -# ifndef USE_AS_WMEMCMP - cmp $41, %ecx - je L(41bytes) - cmp $42, %ecx - je L(42bytes) - cmp $43, %ecx - je L(43bytes) - cmp $44, %ecx - je L(44bytes) - cmp $45, %ecx - je L(45bytes) - cmp $46, %ecx - je L(46bytes) - jmp L(47bytes) - - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - movl -44(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - movl -40(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - movl -36(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - movl -32(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - movl -28(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - movl -24(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - movl -20(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - movl -16(%rsi), %ecx - cmp %ecx, %eax - jne 
L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - movl -12(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - movl -8(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - movl -4(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# else - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - cmp -44(%rsi), %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - cmp -40(%rsi), %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - cmp -36(%rsi), %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - cmp -32(%rsi), %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - cmp -28(%rsi), %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - cmp -24(%rsi), %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - cmp -20(%rsi), %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(45bytes): - movl -45(%rdi), %eax - movl -45(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(41bytes): - movl -41(%rdi), %eax - movl -41(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(37bytes): - movl -37(%rdi), %eax - movl -37(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(33bytes): - movl -33(%rdi), %eax - movl -33(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(29bytes): - movl -29(%rdi), %eax - movl -29(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(25bytes): - movl -25(%rdi), %eax - movl -25(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(21bytes): - movl -21(%rdi), %eax - movl -21(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(17bytes): - movl -17(%rdi), %eax - movl -17(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(13bytes): - movl -13(%rdi), %eax - movl -13(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(9bytes): - movl -9(%rdi), %eax - movl -9(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(5bytes): - movl -5(%rdi), %eax - movl -5(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(1bytes): - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(46bytes): - movl -46(%rdi), %eax - movl -46(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(42bytes): - movl -42(%rdi), %eax - movl -42(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(38bytes): - movl -38(%rdi), %eax - movl -38(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(34bytes): - movl -34(%rdi), %eax - movl -34(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(30bytes): - movl -30(%rdi), %eax - movl -30(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(26bytes): - movl -26(%rdi), %eax - movl -26(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(22bytes): - movl -22(%rdi), %eax - movl -22(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(18bytes): - movl -18(%rdi), %eax - movl -18(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(14bytes): - movl -14(%rdi), %eax - movl -14(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(10bytes): - movl -10(%rdi), %eax - movl -10(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(6bytes): - movl -6(%rdi), %eax - movl -6(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(2bytes): - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - cmpb %cl, %al - jne L(set) - 
cmp %ecx, %eax - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(47bytes): - movl -47(%rdi), %eax - movl -47(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(43bytes): - movl -43(%rdi), %eax - movl -43(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(39bytes): - movl -39(%rdi), %eax - movl -39(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(35bytes): - movl -35(%rdi), %eax - movl -35(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(31bytes): - movl -31(%rdi), %eax - movl -31(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(27bytes): - movl -27(%rdi), %eax - movl -27(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(23bytes): - movl -23(%rdi), %eax - movl -23(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(19bytes): - movl -19(%rdi), %eax - movl -19(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(15bytes): - movl -15(%rdi), %eax - movl -15(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(11bytes): - movl -11(%rdi), %eax - movl -11(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(7bytes): - movl -7(%rdi), %eax - movl -7(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(3bytes): - movzwl -3(%rdi), %eax - movzwl -3(%rsi), %ecx - cmpb %cl, %al - jne L(set) - cmp %ecx, %eax - jne L(set) - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(find_diff): - cmpb %cl, %al - jne L(set) - cmpw %cx, %ax - jne L(set) - shr $16, %eax - shr $16, %ecx - cmpb %cl, %al - jne L(set) - -/* We get there only if we already know there is a -difference. */ - - cmp %ecx, %eax -L(set): - sbb %eax, %eax - sbb $-1, %eax - ret -# else - -/* for wmemcmp */ - .p2align 4 -L(find_diff): - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret -# endif - - .p2align 4 -L(equal): - xor %eax, %eax - ret - -END (MEMCMP) -#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S deleted file mode 100644 index a41ef95fc1..0000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_ssse3 - -#include "memcmp-ssse3.S" -- 2.25.1
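For readers skimming the removed file: the L(find_diff)/L(set) tail deleted
above computes the memcmp return value once two 4-byte words are known to
differ. It scans from the lowest-addressed byte and uses the sbb/sbb $-1
trick to map the unsigned comparison of the first differing byte to -1 or 1.
A minimal C model of that tail (hypothetical helper, not code from the
patch):

    #include <stdint.h>

    /* Return the memcmp result for two little-endian 32-bit words
       loaded from each buffer, assuming a != b.  memcmp compares
       bytes unsigned, lowest address first.  */
    static int
    find_diff (uint32_t a, uint32_t b)
    {
      for (int i = 0; i < 4; i++)
        {
          uint8_t ba = a >> (8 * i), bb = b >> (8 * i);
          if (ba != bb)
            return ba < bb ? -1 : 1;   /* sbb %eax,%eax; sbb $-1,%eax */
        }
      return 0;   /* not reached when a != b */
    }

The wmemcmp variant (the jg/neg sequence under "/* for wmemcmp */") instead
derives the result from a signed comparison of the whole 4-byte element,
since its callers reach L(find_diff) straight from a cmp of full dwords.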
* [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-03-25 20:44 ` Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein ` (3 subsequent siblings) 4 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - sysdeps/x86_64/multiarch/strcmp.c | 4 - sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- sysdeps/x86_64/multiarch/strncmp.c | 4 - sysdeps/x86_64/strcmp.S | 155 ++++-------------- 10 files changed, 30 insertions(+), 202 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 51222dfab1..ed2def288d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -58,7 +58,6 @@ sysdep_routines += \ strcasecmp_l-evex \ strcasecmp_l-sse2 \ strcasecmp_l-sse4_2 \ - strcasecmp_l-ssse3 \ strcat-avx2 \ strcat-avx2-rtm \ strcat-evex \ @@ -80,7 +79,6 @@ sysdep_routines += \ strcmp-sse2 \ strcmp-sse2-unaligned \ strcmp-sse4_2 \ - strcmp-ssse3 \ strcpy-avx2 \ strcpy-avx2-rtm \ strcpy-evex \ @@ -98,7 +96,6 @@ sysdep_routines += \ strncase_l-evex \ strncase_l-sse2 \ strncase_l-sse4_2 \ - strncase_l-ssse3 \ strncat-avx2 \ strncat-avx2-rtm \ strncat-c \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncmp-evex \ strncmp-sse2 \ strncmp-sse4_2 \ - strncmp-ssse3 \ strncpy-avx2 \ strncpy-avx2-rtm \ strncpy-c \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f389928a4e..7e2be3554b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), __strcmp_sse42) - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), - __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), __strncmp_sse42) - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), - __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) #ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h index 766539c241..296d32071b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -20,7 +20,6 @@ #include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S deleted file mode 100644 index fb2f9ae14a..0000000000 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strcasecmp_l_ssse3 -#define __strcasecmp __strcasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S deleted file mode 100644 index 1b7fa33c91..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S +++ /dev/null @@ -1,5 +0,0 @@ -#if IS_IN (libc) -# define USE_SSSE3 1 -# define STRCMP __strcmp_ssse3 -# include "../strcmp.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 68cb73baad..a248c2a6e6 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ 
b/sysdeps/x86_64/multiarch/strcmp.c @@ -28,7 +28,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S deleted file mode 100644 index 6728678688..0000000000 --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRNCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strncasecmp_l_ssse3 -#define __strncasecmp __strncasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S deleted file mode 100644 index ec37308347..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S +++ /dev/null @@ -1,28 +0,0 @@ -/* strcmp optimized with SSSE3. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#define STRCMP __strncmp_ssse3 - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(strcmp) - -#define USE_SSSE3 1 -#define USE_AS_STRNCMP -#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index fca74199d8..70ae6547c9 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -27,7 +27,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 99d8b36f1d..c38dc627f9 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -59,12 +59,7 @@ # endif #endif -#ifndef USE_SSSE3 .text -#else - .section .text.ssse3,"ax",@progbits -#endif - #ifdef USE_AS_STRCASECMP_L # ifndef ENTRY2 # define ENTRY2(name) ENTRY (name) @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, 
%xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 
-#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + 
TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 -- 2.25.1
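The strcmp.S hunks above all make the same substitution: with USE_SSSE3
gone, the unconditional psrldq/pslldq/por sequence replaces the one-instruction
palignr merge. Both forms produce the same 16-byte value; for the ashr_1
case, bytes 1..15 of the previous block followed by byte 0 of the next. A
sketch of the equivalence with intrinsics (illustrative only, not code from
the patch; compile the SSSE3 variant with a target that has SSSE3):

    #include <immintrin.h>

    /* SSE2 path kept by the patch:
       (lo >> 1 byte) | (hi << 15 bytes).  */
    static __m128i
    merge_sse2 (__m128i lo /* %xmm3 */, __m128i hi /* %xmm2 */)
    {
      return _mm_or_si128 (_mm_srli_si128 (lo, 1),    /* psrldq $1  */
                           _mm_slli_si128 (hi, 15));  /* pslldq $15 */
    }

    /* SSSE3 path removed by the patch; same result in one insn.  */
    static __m128i __attribute__ ((target ("ssse3")))
    merge_ssse3 (__m128i lo, __m128i hi)
    {
      return _mm_alignr_epi8 (hi, lo, 1);             /* palignr $1 */
    }

The SSE2 form costs two extra instructions per merge, which the commit
message judges acceptable against the code-size saving of dropping the
whole SSSE3 variant.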
* [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-03-25 20:44 ` Noah Goldstein 2022-04-10 0:57 ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein ` (2 subsequent siblings) 4 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +- sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 -------------------- sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 - 5 files changed, 7 insertions(+), 3183 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index ed2def288d..48f81711ae 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -16,7 +16,6 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ - memcpy-ssse3 \ memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ @@ -24,7 +23,6 @@ sysdep_routines += \ memmove-avx512-unaligned-erms \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ - memmove-ssse3 \ memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 7e2be3554b..70b0e9c62e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, @@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3_back) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned) @@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, @@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE 
(AVX512F), __memcpy_avx512_no_vzeroupper) @@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, @@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..1ecdd4b0d3 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +92,15 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) + return OPTIMIZE (ssse3_back); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S deleted file mode 100644 index 65644d3a09..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ /dev/null @@ -1,3151 +0,0 @@ -/* memcpy with SSSE3 - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -# define MEMPCPY __mempcpy_ssse3 -# define MEMPCPY_CHK __mempcpy_chk_ssse3 -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(write_0bytes) - cmp $79, %rdx - jbe L(copy_forward) - jmp L(copy_backward) -L(copy_forward): -#endif -L(start): - cmp $79, %rdx - lea L(table_less_80bytes)(%rip), %r11 - ja L(80bytesormore) - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(80bytesormore): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_fwd) - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - - .p2align 4 -L(copy_backward): - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_bwd) - and $0xf, %r9 - jz L(shl_0_bwd) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - - .p2align 4 -L(shl_0): - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble) - cmp $64, %rdx - jb L(shl_0_less_64bytes) - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -L(shl_0_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF - cmp 
$DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_cache_less_64bytes) - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_cache_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_less_64bytes) - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_mem_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_less_32bytes) - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -L(shl_0_mem_less_32bytes): - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble_bwd) - cmp $64, %rdx - jb L(shl_0_less_64bytes_bwd) - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -L(shl_0_less_64bytes_bwd): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_bwd): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_bwd_loop) -L(shl_0_gobble_bwd_loop): - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, 
-0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_gobble_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_gobble_bwd_less_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_bwd_loop): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa %xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_mem_bwd_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_bwd_less_32bytes) - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -L(shl_0_mem_bwd_less_32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1): - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_fwd) - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 -L(L1_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_1_loop_L1): - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_1_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_bwd) - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 -L(L1_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_1_bwd_loop_L1): - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 
- movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_1_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2): - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_fwd) - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 -L(L2_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_2_loop_L1): - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_2_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_bwd) - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 -L(L2_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_2_bwd_loop_L1): - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_2_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3): - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_fwd) - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 -L(L3_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_3_loop_L1): - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_3_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - lea 
(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_bwd) - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 -L(L3_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_3_bwd_loop_L1): - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_3_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4): - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_fwd) - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 -L(L4_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_4_loop_L1): - sub $64, %rdx - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_4_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_bwd) - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 -L(L4_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_4_bwd_loop_L1): - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_4_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5): - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_fwd) - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 -L(L5_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_5_loop_L1): - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb 
L(shl_5_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_bwd) - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 -L(L5_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_5_bwd_loop_L1): - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_5_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6): - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_fwd) - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 -L(L6_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_6_loop_L1): - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 -L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 -L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), 
%xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 -L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 -L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 -L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 -L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 -L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - 
- .p2align 4 -L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 -L(L9_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_9_loop_L1): - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_9_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_bwd) - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 -L(L9_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_9_bwd_loop_L1): - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_9_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10): - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_fwd) - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 -L(L10_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_10_loop_L1): - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_10_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_bwd) - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 -L(L10_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_10_bwd_loop_L1): - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, 
%xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_10_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11): - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_fwd) - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 -L(L11_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_11_loop_L1): - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_11_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_bwd) - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 -L(L11_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_11_bwd_loop_L1): - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_11_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12): - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_fwd) - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 -L(L12_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_12_loop_L1): - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_12_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_bwd) - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), 
%r9 -L(L12_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_12_bwd_loop_L1): - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_12_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13): - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_fwd) - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 -L(L13_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_13_loop_L1): - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_13_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_bwd) - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 -L(L13_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_13_bwd_loop_L1): - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_13_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14): - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_fwd) - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 -L(L14_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_14_loop_L1): - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_14_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 
-L(shl_14_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_bwd) - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 -L(L14_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_14_bwd_loop_L1): - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_14_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15): - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_fwd) - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 -L(L15_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_15_loop_L1): - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_15_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_bwd) - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 -L(L15_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_15_bwd_loop_L1): - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_15_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(write_72bytes): - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_64bytes): - movdqu 
-64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_56bytes): - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_48bytes): - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_40bytes): - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_32bytes): - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_24bytes): - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_16bytes): - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -L(write_0bytes): - ret - - .p2align 4 -L(write_73bytes): - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_65bytes): - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_57bytes): - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_49bytes): - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_41bytes): - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_33bytes): - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 
- mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_25bytes): - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_17bytes): - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_9bytes): - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_1bytes): - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_74bytes): - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_66bytes): - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_58bytes): - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_50bytes): - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_42bytes): - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_34bytes): - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_26bytes): - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_18bytes): - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_10bytes): - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_2bytes): - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(write_75bytes): - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx 
- mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_67bytes): - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_59bytes): - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_51bytes): - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_43bytes): - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_35bytes): - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_27bytes): - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_19bytes): - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_11bytes): - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(write_76bytes): - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_68bytes): - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_60bytes): - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, 
-4(%rdi) - ret - - .p2align 4 -L(write_52bytes): - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_44bytes): - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_36bytes): - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_28bytes): - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_20bytes): - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_12bytes): - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_77bytes): - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_69bytes): - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_61bytes): - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_53bytes): - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_45bytes): - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_37bytes): - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_29bytes): - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov 
-13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_21bytes): - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_13bytes): - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_78bytes): - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_70bytes): - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_62bytes): - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - 
ret - - .p2align 4 -L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_23bytes): - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif -L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - 
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#endif - .p2align 4 -L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif -L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) 
- movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) -#endif - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL 
(L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - - .p2align 3 -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S deleted file mode 100644 index 295430b1ef..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3 -#define MEMCPY_CHK __memmove_chk_ssse3 -#include "memcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
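An aside on the jump tables just deleted above: L(table_less_80bytes), L(shl_table) and L(shl_table_bwd) store 4-byte self-relative offsets rather than 8-byte absolute pointers, and BRANCH_TO_JMPTBL_ENTRY reconstructs the branch target by adding the loaded entry to the table's own address. Below is a minimal C sketch of that dispatch idea, relying on GCC's labels-as-values extension; copy_tail and its labels are hypothetical illustration names, not glibc code.

```c
#include <stddef.h>

/* Dispatch through a table of 32-bit self-relative offsets, as the
   deleted ".int JMPTBL (target, table)" entries do: each entry holds
   "label minus anchor", so the table needs no relocations and is half
   the size of a table of absolute pointers.  Copies n bytes, n in 0..3.  */
static void
copy_tail (char *dst, const char *src, size_t n)
{
  static const int off[] = {
    &&w0 - &&w0,   /* n == 0: jump straight to the return */
    &&w1 - &&w0,   /* n == 1 */
    &&w2 - &&w0,   /* n == 2 */
    &&w3 - &&w0,   /* n == 3 */
  };
  goto *(&&w0 + off[n]);   /* the BRANCH_TO_JMPTBL_ENTRY step */
 w3:
  dst[2] = src[2];
 w2:
  dst[1] = src[1];
 w1:
  dst[0] = src[0];
 w0:
  return;
}
```

The deleted file's L(write_Nbytes) entries each end in ret instead of falling through, but the table mechanics are the same; the 4-byte entries are also why removing such tables recovers rodata as well as text.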
* [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein @ 2022-04-10 0:57 ` Noah Goldstein 0 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:57 UTC (permalink / raw) To: libc-alpha

The goal is to remove most SSSE3 functions, as SSE4, AVX2, and EVEX are generally preferable. memcpy/memmove is one exception where avoiding unaligned loads with `palignr` is important for some targets.

This commit replaces memmove-ssse3 with a better optimized and lower code footprint version. As well, it aliases memcpy to memmove. Aside from this function, all other SSSE3 functions should be safe to remove.

The performance is not changed drastically, although it shows overall improvements without any major regressions or gains.

bench-memcpy geometric_mean(N=50) New / Original: 0.962
bench-memcpy-random geometric_mean(N=50) New / Original: 0.895
bench-memcpy-large geometric_mean(N=50) New / Original: 0.894

Benchmarks were run on Zhaoxin KX-6840@2000MHz. See attached numbers for all results.

More importantly, this saves 7246 bytes of code size in memmove and an additional 10741 bytes by reusing the memmove code for memcpy (17987 bytes saved in total), as well as an additional 896 bytes of rodata for the jump table entries.
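On the `palignr` point above: the deleted L(shl_N) loops turn an unaligned source stream into aligned loads by reading 16-byte aligned vectors and splicing adjacent pairs together. Here is a minimal intrinsics sketch of that idea for one shift value (7), under simplifying assumptions that are not part of either version of the file: src % 16 == 7, dst 16-byte aligned, n a nonzero multiple of 16, and the few bytes read before src and after src + n known to be readable (the real prologue/epilogue handles those edges). Build with -mssse3.

```c
#include <stddef.h>
#include <stdint.h>
#include <tmmintrin.h>

/* Copy n bytes from a source that sits 7 bytes past a 16-byte boundary,
   using only aligned loads: load the aligned vectors surrounding the
   data and let palignr splice each 16-byte chunk out of two neighbours.  */
static void
copy_shift7 (uint8_t *dst, const uint8_t *src, size_t n)
{
  const __m128i *s = (const __m128i *) (src - 7);   /* 16-byte aligned */
  __m128i prev = _mm_load_si128 (s);
  for (size_t i = 0; i < n; i += 16)
    {
      __m128i next = _mm_load_si128 (++s);
      /* Bytes 7..15 of prev followed by bytes 0..6 of next: this is
         exactly src[i..i+15], built without any unaligned load.  */
      __m128i chunk = _mm_alignr_epi8 (next, prev, 7);
      _mm_store_si128 ((__m128i *) (dst + i), chunk);
      prev = next;
    }
}
```

Because the shift is an immediate operand of palignr, the old file needs one such loop per shift value 0..15 (the L(shl_0)..L(shl_15) blocks and their jump tables), which is much of the code and rodata footprint the new version wins back.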
---

memcpy benchmarks comparing memcpy-ssse3 before / after this patch. Results are the geometric mean of N=50 runs on Zhaoxin KX-6840@2000MHz.

bench-memcpy: length, align1, align2, dst > src, New Time / Old Time
1, 0, 0, 0, 2.099 1, 0, 0, 1, 2.099 1, 32, 0, 0, 2.103 1, 32, 0, 1, 2.103 1, 0, 32, 0, 2.099 1, 0, 32, 1, 2.098 1, 32, 32, 0, 2.098 1, 32, 32, 1, 2.098 1, 2048, 0, 0, 2.098 1, 2048, 0, 1, 2.098 2, 0, 0, 0, 1.135 2, 0, 0, 1, 1.136 2, 1, 0, 0, 1.139 2, 1, 0, 1, 1.139 2, 33, 0, 0, 1.165 2, 33, 0, 1, 1.139 2, 0, 1, 0, 1.136 2, 0, 1, 1, 1.136 2, 0, 33, 0, 1.136 2, 0, 33, 1, 1.136 2, 1, 1, 0, 1.136 2, 1, 1, 1, 1.136 2, 33, 33, 0, 1.136 2, 33, 33, 1, 1.136 2, 2048, 0, 0, 1.136 2, 2048, 0, 1, 1.136 2, 2049, 0, 0, 1.191 2, 2049, 0, 1, 1.139 2, 2048, 1, 0, 1.136 2, 2048, 1, 1, 1.136 2, 2049, 1, 0, 1.136 2, 2049, 1, 1, 1.136 4, 0, 0, 0, 1.074 4, 0, 0, 1, 0.962 4, 2, 0, 0, 0.973 4, 2, 0, 1, 0.989 4, 34, 0, 0, 0.991 4, 34, 0, 1, 0.991 4, 0, 2, 0, 0.962 4, 0, 2, 1, 0.962 4, 0, 34, 0, 0.962 4, 0, 34, 1, 0.962 4, 2, 2, 0, 0.962 4, 2, 2, 1, 0.962 4, 34, 34, 0, 0.962 4, 34, 34, 1, 0.962 4, 2048, 0, 0, 0.962 4, 2048, 0, 1, 0.962 4, 2050, 0, 0, 0.977 4, 2050, 0, 1, 0.979 4, 2048, 2, 0, 0.962 4, 2048, 2, 1, 0.962 4, 2050, 2, 0, 0.962 4, 2050, 2, 1, 0.962 8, 0, 0, 0, 0.961 8, 0, 0, 1, 0.962 8, 3, 0, 0, 1.0 8, 3, 0, 1, 1.0 8, 35, 0, 0, 1.0 8, 35, 0, 1, 1.0 8, 0, 3, 0, 0.962 8, 0, 3, 1, 0.962 8, 0, 35, 0, 0.962 8, 0, 35, 1, 0.962 8, 3, 3, 0, 0.962 8, 3, 3, 1, 0.962 8, 35, 35, 0, 0.962 8, 35, 35, 1, 0.962 8, 2048, 0, 0, 0.962 8, 2048, 0, 1, 0.962 8, 2051, 0, 0, 1.0 8, 2051, 0, 1, 1.0 8, 2048, 3, 0, 0.962 8, 2048, 3, 1, 0.962 8, 2051, 3, 0, 0.962 8, 2051, 3, 1, 0.962 16, 0, 0, 0, 0.798 16, 0, 0, 1, 0.799 16, 4, 0, 0, 0.8 16, 4, 0, 1, 0.801 16, 36, 0, 0, 0.801 16, 36, 0, 1, 0.8 16, 0, 4, 0, 0.798 16, 0, 4, 1, 0.798 16, 0, 36, 0, 0.798 16, 0, 36, 1, 0.798 16, 4, 4, 0, 0.798 16, 4, 4, 1, 0.798 16, 36, 36, 0, 0.798 16, 36, 36, 1, 0.798 16, 2048, 0, 0, 0.798 16, 2048, 0, 1, 0.799 16, 2052, 0, 0, 0.8 16, 2052, 0, 1, 0.8 16, 2048, 4, 0, 0.798 16, 2048, 4, 1, 0.798 16, 2052, 4, 0, 0.798 16, 2052, 4, 1, 0.798 32, 0, 0, 0, 0.471 32, 0, 0, 1, 0.471 32, 5, 0, 0, 0.471 32, 5, 0, 1, 0.471 32, 37, 0, 0, 0.961 32, 37, 0, 1, 0.961 32, 0, 5, 0, 0.471 32, 0, 5, 1, 0.471 32, 0, 37, 0, 1.021 32, 0, 37, 1, 1.021 32, 5, 5, 0, 0.471 32, 5, 5, 1, 0.471 32, 37, 37, 0, 1.011 32, 37, 37, 1, 1.011 32, 2048, 0, 0, 0.471 32, 2048, 0, 1, 0.471 32, 2053, 0, 0, 0.471 32, 2053, 0, 1, 0.471 32, 2048, 5, 0, 0.471 32, 2048, 5, 1, 0.471 32, 2053, 5, 0, 0.471 32, 2053, 5, 1, 0.471 64, 0, 0, 0, 1.0 64, 0, 0, 1, 1.0 64, 6, 0, 0, 0.862 64, 6, 0, 1, 0.862 64, 38, 0, 0, 0.912 64, 38, 0, 1, 0.912 64, 0, 6, 0, 0.896 64, 0, 6, 1, 0.896 64, 0, 38, 0, 0.906 64, 0, 38, 1, 0.906 64, 6, 6, 0, 0.91 64, 6, 6, 1, 0.91 64, 38, 38, 0, 0.883 64, 38, 38, 1, 0.883 64, 2048, 0, 0, 1.0 64, 2048, 0, 1, 1.0 64, 2054, 0, 0, 0.862 64, 2054, 0, 1, 0.862 64, 2048, 6, 0, 0.887 64, 2048, 6, 1, 0.887 64, 2054, 6, 0, 0.887 64, 2054, 6, 1, 0.887 128, 0, 0, 0, 0.857 128, 0, 0, 1, 0.857 128, 7, 0, 0, 0.875 128, 7, 0, 1, 0.875 128, 39, 0, 0, 0.892 128, 39, 0, 1, 0.892 128, 0, 7, 0, 1.183 128, 0, 7, 1, 1.183 128, 0, 39, 0, 1.113 128, 0, 39, 1, 1.113 128, 7, 7, 0, 0.692 128, 7, 7, 1, 0.692 128, 39, 39, 0, 1.104 128, 39, 39, 1, 1.104 128, 2048, 0, 0, 0.857 128, 2048, 0, 1, 0.857 128, 2055, 0, 0, 0.875 128, 2055, 0, 1, 0.875 128, 2048, 7, 0, 0.959 128, 2048, 7, 1, 0.959 128, 2055, 7, 0, 1.036 128, 2055, 7, 1, 1.036 256, 0, 0, 0, 0.889 256, 0, 0, 1, 0.889 256, 8, 0, 0, 0.966 256, 8, 0, 1, 0.966 256, 40, 0, 0, 0.983 256, 40, 0, 1, 0.983 256, 0, 8, 0, 1.29 256, 0, 8, 1, 1.29 256, 0, 40, 0, 1.274 256, 0, 40, 1, 1.274 256, 8, 8, 0, 0.865 256, 8, 8, 1, 0.865 256, 40, 40, 0, 1.477 256, 40, 40, 1, 1.477 256, 2048, 0, 0, 0.889 256, 2048, 0, 1, 0.889 256, 2056, 0, 0, 0.966 256, 2056, 0, 1, 0.966 256, 2048, 8, 0, 0.952 256, 2048, 8, 1, 0.952 256, 2056, 8, 0, 0.878 256, 2056, 8, 1, 0.878 512, 0, 0, 0, 1.077 512, 0, 0, 1, 1.077 512, 9, 0, 0, 1.001 512, 9, 0, 1, 1.0 512, 41, 0, 0, 0.954 512, 41, 0, 1, 0.954 512, 0, 9, 0, 1.191 512, 0, 9, 1, 1.191 512, 0, 41, 0, 1.181 512, 0, 41, 1, 1.181 512, 9, 9, 0, 0.765 512, 9, 9, 1, 0.765 512, 41, 41, 0, 0.905 512, 41, 41, 1, 0.905 512, 2048, 0, 0, 1.077 512, 2048, 0, 1, 1.077 512, 2057, 0, 0, 1.0 512, 2057, 0, 1, 1.0 512, 2048, 9, 0, 1.0 512, 2048, 9, 1, 1.0 512, 2057, 9, 0, 0.733 512, 2057, 9, 1, 0.733 1024, 0, 0, 0, 1.143 1024, 0, 0, 1, 1.143 1024, 10, 0, 0, 1.015 1024, 10, 0, 1, 1.015 1024, 42, 0, 0, 1.045 1024, 42, 0, 1, 1.045 1024, 0, 10, 0, 1.126 1024, 0, 10, 1, 1.126 1024, 0, 42, 0, 1.114 1024, 0, 42, 1, 1.114 1024, 10, 10, 0, 0.89 1024, 10, 10, 1, 0.89 1024, 42, 42, 0, 0.986 1024, 42, 42, 1, 0.986 1024, 2048, 0, 0, 1.143 1024, 2048, 0, 1, 1.143 1024, 2058, 0, 0, 1.015 1024, 2058, 0, 1, 1.015 1024, 2048, 10, 0, 1.03 1024, 2048, 10, 1, 1.03 1024, 2058, 10, 0, 0.854 1024, 2058, 10, 1, 0.854 2048, 0, 0, 0, 1.005 2048, 0, 0, 1, 1.005 2048, 11, 0, 0, 1.013 2048, 11, 0, 1, 1.014 2048, 43, 0, 0, 1.044 2048, 43, 0, 1, 1.044 2048, 0, 11, 0, 1.003 2048, 0, 11, 1, 1.003 2048, 0, 43, 0, 1.003 2048, 0, 43, 1, 1.003 2048, 11, 11, 0, 0.92 2048, 11, 11, 1, 0.92 2048, 43, 43, 0, 1.0 2048, 43, 43, 1, 1.0 2048, 2048, 0, 0, 1.005 2048, 2048, 0, 1, 1.005 2048, 2059, 0, 0, 0.904 2048, 2059, 0, 1, 0.904 2048, 2048, 11, 0, 1.0 2048, 2048, 11, 1, 1.0 2048, 2059, 11, 0, 0.979 2048, 2059, 11, 1, 0.979 4096, 0, 0, 0, 1.014 4096, 0, 0, 1, 1.014 4096, 12, 0, 0, 0.855 4096, 12, 0, 1, 0.855 4096, 44, 0, 0, 0.857 4096, 44, 0, 1, 0.857 4096, 0, 12, 0, 0.932 4096, 0, 12, 1, 0.932 4096, 0, 44, 0, 0.932 4096, 0, 44, 1, 0.932 4096, 12, 12, 0, 0.999 4096, 12, 12, 1, 0.999 4096, 44, 44, 0, 1.051 4096, 44, 44, 1, 1.051 4096, 2048, 0, 0, 1.014 4096, 2048, 0, 1, 1.014 4096, 2060, 0,
0, 0.98 4096, 2060, 0, 1, 0.98 4096, 2048, 12, 0, 0.77 4096, 2048, 12, 1, 0.77 4096, 2060, 12, 0, 0.943 4096, 2060, 12, 1, 0.943 8192, 0, 0, 0, 1.046 8192, 0, 0, 1, 1.046 8192, 13, 0, 0, 0.885 8192, 13, 0, 1, 0.885 8192, 45, 0, 0, 0.887 8192, 45, 0, 1, 0.886 8192, 0, 13, 0, 0.942 8192, 0, 13, 1, 0.942 8192, 0, 45, 0, 0.942 8192, 0, 45, 1, 0.942 8192, 13, 13, 0, 1.03 8192, 13, 13, 1, 1.03 8192, 45, 45, 0, 1.048 8192, 45, 45, 1, 1.048 8192, 2048, 0, 0, 1.048 8192, 2048, 0, 1, 1.048 8192, 2061, 0, 0, 1.011 8192, 2061, 0, 1, 1.011 8192, 2048, 13, 0, 0.789 8192, 2048, 13, 1, 0.789 8192, 2061, 13, 0, 0.991 8192, 2061, 13, 1, 0.991 16384, 0, 0, 0, 1.014 16384, 0, 0, 1, 1.008 16384, 14, 0, 0, 0.951 16384, 14, 0, 1, 0.95 16384, 46, 0, 0, 0.874 16384, 46, 0, 1, 0.871 16384, 0, 14, 0, 0.813 16384, 0, 14, 1, 0.81 16384, 0, 46, 0, 0.85 16384, 0, 46, 1, 0.86 16384, 14, 14, 0, 0.985 16384, 14, 14, 1, 0.975 16384, 46, 46, 0, 1.025 16384, 46, 46, 1, 1.027 16384, 2048, 0, 0, 1.058 16384, 2048, 0, 1, 1.058 16384, 2062, 0, 0, 0.849 16384, 2062, 0, 1, 0.848 16384, 2048, 14, 0, 0.907 16384, 2048, 14, 1, 0.907 16384, 2062, 14, 0, 0.988 16384, 2062, 14, 1, 0.995 32768, 0, 0, 0, 0.979 32768, 0, 0, 1, 0.979 32768, 15, 0, 0, 1.006 32768, 15, 0, 1, 1.006 32768, 47, 0, 0, 1.004 32768, 47, 0, 1, 1.004 32768, 0, 15, 0, 1.045 32768, 0, 15, 1, 1.045 32768, 0, 47, 0, 1.011 32768, 0, 47, 1, 1.012 32768, 15, 15, 0, 0.977 32768, 15, 15, 1, 0.977 32768, 47, 47, 0, 0.96 32768, 47, 47, 1, 0.96 32768, 2048, 0, 0, 0.978 32768, 2048, 0, 1, 0.978 32768, 2063, 0, 0, 1.004 32768, 2063, 0, 1, 1.004 32768, 2048, 15, 0, 1.036 32768, 2048, 15, 1, 1.036 32768, 2063, 15, 0, 0.978 32768, 2063, 15, 1, 0.978 65536, 0, 0, 0, 0.981 65536, 0, 0, 1, 0.981 65536, 16, 0, 0, 0.987 65536, 16, 0, 1, 0.987 65536, 48, 0, 0, 0.968 65536, 48, 0, 1, 0.968 65536, 0, 16, 0, 1.014 65536, 0, 16, 1, 1.014 65536, 0, 48, 0, 0.984 65536, 0, 48, 1, 0.984 65536, 16, 16, 0, 1.01 65536, 16, 16, 1, 1.01 65536, 48, 48, 0, 0.968 65536, 48, 48, 1, 0.968 65536, 2048, 0, 0, 0.982 65536, 2048, 0, 1, 0.982 65536, 2064, 0, 0, 0.987 65536, 2064, 0, 1, 0.987 65536, 2048, 16, 0, 1.012 65536, 2048, 16, 1, 1.012 65536, 2064, 16, 0, 1.007 65536, 2064, 16, 1, 1.007 0, 0, 0, 0, 2.104 0, 2048, 0, 0, 2.104 0, 4095, 0, 0, 2.109 0, 0, 4095, 0, 2.103 1, 1, 0, 0, 2.104 1, 0, 1, 0, 2.098 1, 1, 1, 0, 2.098 1, 2049, 0, 0, 2.102 1, 2048, 1, 0, 2.098 1, 2049, 1, 0, 2.098 1, 4095, 0, 0, 2.103 1, 0, 4095, 0, 2.098 2, 2, 0, 0, 1.139 2, 0, 2, 0, 1.136 2, 2, 2, 0, 1.136 2, 2050, 0, 0, 1.139 2, 2048, 2, 0, 1.136 2, 2050, 2, 0, 1.136 2, 4095, 0, 0, 1.0 2, 0, 4095, 0, 1.022 3, 0, 0, 0, 0.981 3, 3, 0, 0, 0.984 3, 0, 3, 0, 0.982 3, 3, 3, 0, 0.982 3, 2048, 0, 0, 0.982 3, 2051, 0, 0, 0.983 3, 2048, 3, 0, 0.982 3, 2051, 3, 0, 0.982 3, 4095, 0, 0, 0.285 3, 0, 4095, 0, 0.231 4, 4, 0, 0, 1.373 4, 0, 4, 0, 1.31 4, 4, 4, 0, 1.282 4, 2052, 0, 0, 1.264 4, 2048, 4, 0, 1.254 4, 2052, 4, 0, 1.254 4, 4095, 0, 0, 1.971 4, 0, 4095, 0, 1.994 5, 0, 0, 0, 1.145 5, 5, 0, 0, 1.155 5, 0, 5, 0, 1.171 5, 5, 5, 0, 1.171 5, 2048, 0, 0, 1.197 5, 2053, 0, 0, 1.173 5, 2048, 5, 0, 1.171 5, 2053, 5, 0, 1.171 5, 4095, 0, 0, 0.935 5, 0, 4095, 0, 1.017 6, 0, 0, 0, 1.145 6, 6, 0, 0, 1.098 6, 0, 6, 0, 1.096 6, 6, 6, 0, 1.096 6, 2048, 0, 0, 1.12 6, 2054, 0, 0, 1.122 6, 2048, 6, 0, 1.12 6, 2054, 6, 0, 1.096 6, 4095, 0, 0, 0.935 6, 0, 4095, 0, 1.018 7, 0, 0, 0, 1.071 7, 7, 0, 0, 1.074 7, 0, 7, 0, 1.072 7, 7, 7, 0, 1.072 7, 2048, 0, 0, 1.096 7, 2055, 0, 0, 1.098 7, 2048, 7, 0, 1.096 7, 2055, 7, 0, 1.096 7, 4095, 0, 0, 0.935 7, 0, 4095, 0, 1.016 
8, 8, 0, 0, 1.167 8, 0, 8, 0, 1.028 8, 8, 8, 0, 1.028 8, 2056, 0, 0, 1.069 8, 2048, 8, 0, 1.028 8, 2056, 8, 0, 1.028 8, 4095, 0, 0, 1.029 8, 0, 4095, 0, 1.043 9, 0, 0, 0, 0.799 9, 9, 0, 0, 0.801 9, 0, 9, 0, 0.799 9, 9, 9, 0, 0.799 9, 2048, 0, 0, 0.8 9, 2057, 0, 0, 0.801 9, 2048, 9, 0, 0.8 9, 2057, 9, 0, 0.799 9, 4095, 0, 0, 0.909 9, 0, 4095, 0, 1.0 10, 0, 0, 0, 0.799 10, 10, 0, 0, 0.801 10, 0, 10, 0, 0.8 10, 10, 10, 0, 0.8 10, 2048, 0, 0, 0.8 10, 2058, 0, 0, 0.801 10, 2048, 10, 0, 0.8 10, 2058, 10, 0, 0.8 10, 4095, 0, 0, 0.909 10, 0, 4095, 0, 1.0 11, 0, 0, 0, 0.799 11, 11, 0, 0, 0.801 11, 0, 11, 0, 0.8 11, 11, 11, 0, 0.8 11, 2048, 0, 0, 0.8 11, 2059, 0, 0, 0.802 11, 2048, 11, 0, 0.8 11, 2059, 11, 0, 0.8 11, 4095, 0, 0, 0.909 11, 0, 4095, 0, 1.0 12, 0, 0, 0, 0.799 12, 12, 0, 0, 0.801 12, 0, 12, 0, 0.8 12, 12, 12, 0, 0.8 12, 2048, 0, 0, 0.8 12, 2060, 0, 0, 0.802 12, 2048, 12, 0, 0.8 12, 2060, 12, 0, 0.8 12, 4095, 0, 0, 0.909 12, 0, 4095, 0, 1.0 13, 0, 0, 0, 0.798 13, 13, 0, 0, 0.801 13, 0, 13, 0, 0.799 13, 13, 13, 0, 0.799 13, 2048, 0, 0, 0.8 13, 2061, 0, 0, 0.801 13, 2048, 13, 0, 0.8 13, 2061, 13, 0, 0.8 13, 4095, 0, 0, 0.909 13, 0, 4095, 0, 1.0 14, 0, 0, 0, 0.799 14, 14, 0, 0, 0.801 14, 0, 14, 0, 0.8 14, 14, 14, 0, 0.8 14, 2048, 0, 0, 0.8 14, 2062, 0, 0, 0.801 14, 2048, 14, 0, 0.8 14, 2062, 14, 0, 0.8 14, 4095, 0, 0, 0.909 14, 0, 4095, 0, 1.0 15, 0, 0, 0, 0.799 15, 15, 0, 0, 0.801 15, 0, 15, 0, 0.8 15, 15, 15, 0, 0.8 15, 2048, 0, 0, 0.8 15, 2063, 0, 0, 0.802 15, 2048, 15, 0, 0.8 15, 2063, 15, 0, 0.8 15, 4095, 0, 0, 0.909 15, 0, 4095, 0, 1.0 16, 16, 0, 0, 0.801 16, 0, 16, 0, 0.799 16, 16, 16, 0, 0.799 16, 2064, 0, 0, 0.801 16, 2048, 16, 0, 0.798 16, 2064, 16, 0, 0.798 16, 4095, 0, 0, 1.818 16, 0, 4095, 0, 1.957 17, 0, 0, 0, 0.798 17, 17, 0, 0, 0.8 17, 0, 17, 0, 0.799 17, 17, 17, 0, 0.798 17, 2048, 0, 0, 0.798 17, 2065, 0, 0, 0.8 17, 2048, 17, 0, 0.798 17, 2065, 17, 0, 0.799 17, 4095, 0, 0, 0.937 17, 0, 4095, 0, 1.021 18, 0, 0, 0, 0.798 18, 18, 0, 0, 0.801 18, 0, 18, 0, 0.798 18, 18, 18, 0, 0.798 18, 2048, 0, 0, 0.799 18, 2066, 0, 0, 0.8 18, 2048, 18, 0, 0.798 18, 2066, 18, 0, 0.798 18, 4095, 0, 0, 0.937 18, 0, 4095, 0, 1.021 19, 0, 0, 0, 0.798 19, 19, 0, 0, 0.8 19, 0, 19, 0, 0.798 19, 19, 19, 0, 0.798 19, 2048, 0, 0, 0.798 19, 2067, 0, 0, 0.8 19, 2048, 19, 0, 0.798 19, 2067, 19, 0, 0.798 19, 4095, 0, 0, 0.937 19, 0, 4095, 0, 1.021 20, 0, 0, 0, 0.798 20, 20, 0, 0, 0.8 20, 0, 20, 0, 0.798 20, 20, 20, 0, 0.798 20, 2048, 0, 0, 0.798 20, 2068, 0, 0, 0.8 20, 2048, 20, 0, 0.798 20, 2068, 20, 0, 0.798 20, 4095, 0, 0, 0.937 20, 0, 4095, 0, 1.021 21, 0, 0, 0, 0.798 21, 21, 0, 0, 0.801 21, 0, 21, 0, 0.798 21, 21, 21, 0, 0.798 21, 2048, 0, 0, 0.798 21, 2069, 0, 0, 0.801 21, 2048, 21, 0, 0.799 21, 2069, 21, 0, 0.798 21, 4095, 0, 0, 0.937 21, 0, 4095, 0, 1.021 22, 0, 0, 0, 0.798 22, 22, 0, 0, 0.801 22, 0, 22, 0, 0.798 22, 22, 22, 0, 0.798 22, 2048, 0, 0, 0.798 22, 2070, 0, 0, 0.801 22, 2048, 22, 0, 0.798 22, 2070, 22, 0, 0.798 22, 4095, 0, 0, 0.937 22, 0, 4095, 0, 1.021 23, 0, 0, 0, 0.798 23, 23, 0, 0, 0.8 23, 0, 23, 0, 0.798 23, 23, 23, 0, 0.798 23, 2048, 0, 0, 0.798 23, 2071, 0, 0, 0.8 23, 2048, 23, 0, 0.798 23, 2071, 23, 0, 0.798 23, 4095, 0, 0, 0.937 23, 0, 4095, 0, 1.021 24, 0, 0, 0, 0.798 24, 24, 0, 0, 0.8 24, 0, 24, 0, 0.799 24, 24, 24, 0, 0.798 24, 2048, 0, 0, 0.798 24, 2072, 0, 0, 0.801 24, 2048, 24, 0, 0.798 24, 2072, 24, 0, 0.798 24, 4095, 0, 0, 0.937 24, 0, 4095, 0, 1.021 25, 0, 0, 0, 0.5 25, 25, 0, 0, 0.5 25, 0, 25, 0, 0.5 25, 25, 25, 0, 0.5 25, 2048, 0, 0, 0.5 25, 2073, 0, 0, 0.501 25, 
2048, 25, 0, 0.5 25, 2073, 25, 0, 0.5 25, 4095, 0, 0, 0.974 25, 0, 4095, 0, 0.98 26, 0, 0, 0, 0.5 26, 26, 0, 0, 0.501 26, 0, 26, 0, 0.5 26, 26, 26, 0, 0.501 26, 2048, 0, 0, 0.5 26, 2074, 0, 0, 0.5 26, 2048, 26, 0, 0.5 26, 2074, 26, 0, 0.5 26, 4095, 0, 0, 0.974 26, 0, 4095, 0, 1.0 27, 0, 0, 0, 0.5 27, 27, 0, 0, 0.501 27, 0, 27, 0, 0.5 27, 27, 27, 0, 0.5 27, 2048, 0, 0, 0.5 27, 2075, 0, 0, 0.5 27, 2048, 27, 0, 0.5 27, 2075, 27, 0, 0.5 27, 4095, 0, 0, 0.974 27, 0, 4095, 0, 1.0 28, 0, 0, 0, 0.5 28, 28, 0, 0, 0.501 28, 0, 28, 0, 0.5 28, 28, 28, 0, 0.5 28, 2048, 0, 0, 0.5 28, 2076, 0, 0, 0.5 28, 2048, 28, 0, 0.5 28, 2076, 28, 0, 0.5 28, 4095, 0, 0, 0.974 28, 0, 4095, 0, 1.0 29, 0, 0, 0, 0.471 29, 29, 0, 0, 0.471 29, 0, 29, 0, 0.471 29, 29, 29, 0, 0.471 29, 2048, 0, 0, 0.471 29, 2077, 0, 0, 0.471 29, 2048, 29, 0, 0.471 29, 2077, 29, 0, 0.471 29, 4095, 0, 0, 0.974 29, 0, 4095, 0, 1.0 30, 0, 0, 0, 0.471 30, 30, 0, 0, 0.471 30, 0, 30, 0, 0.471 30, 30, 30, 0, 0.471 30, 2048, 0, 0, 0.471 30, 2078, 0, 0, 0.471 30, 2048, 30, 0, 0.471 30, 2078, 30, 0, 0.471 30, 4095, 0, 0, 0.974 30, 0, 4095, 0, 1.0 31, 0, 0, 0, 0.471 31, 31, 0, 0, 0.471 31, 0, 31, 0, 0.471 31, 31, 31, 0, 0.471 31, 2048, 0, 0, 0.471 31, 2079, 0, 0, 0.471 31, 2048, 31, 0, 0.471 31, 2079, 31, 0, 0.471 31, 4095, 0, 0, 0.974 31, 0, 4095, 0, 1.0 48, 0, 0, 0, 1.0 48, 0, 0, 1, 1.0 48, 3, 0, 0, 1.0 48, 3, 0, 1, 1.0 48, 0, 3, 0, 1.0 48, 0, 3, 1, 1.0 48, 3, 3, 0, 1.0 48, 3, 3, 1, 1.0 48, 2048, 0, 0, 1.0 48, 2048, 0, 1, 1.0 48, 2051, 0, 0, 1.0 48, 2051, 0, 1, 1.0 48, 2048, 3, 0, 1.0 48, 2048, 3, 1, 1.0 48, 2051, 3, 0, 1.0 48, 2051, 3, 1, 1.0 80, 0, 0, 0, 0.781 80, 0, 0, 1, 0.782 80, 5, 0, 0, 0.976 80, 5, 0, 1, 0.976 80, 0, 5, 0, 1.232 80, 0, 5, 1, 1.232 80, 5, 5, 0, 1.542 80, 5, 5, 1, 1.543 80, 2048, 0, 0, 0.781 80, 2048, 0, 1, 0.782 80, 2053, 0, 0, 0.976 80, 2053, 0, 1, 0.976 80, 2048, 5, 0, 1.093 80, 2048, 5, 1, 1.093 80, 2053, 5, 0, 1.371 80, 2053, 5, 1, 1.371 96, 0, 0, 0, 0.758 96, 0, 0, 1, 0.758 96, 6, 0, 0, 0.929 96, 6, 0, 1, 0.929 96, 0, 6, 0, 1.204 96, 0, 6, 1, 1.204 96, 6, 6, 0, 1.562 96, 6, 6, 1, 1.562 96, 2048, 0, 0, 0.758 96, 2048, 0, 1, 0.758 96, 2054, 0, 0, 0.929 96, 2054, 0, 1, 0.929 96, 2048, 6, 0, 1.068 96, 2048, 6, 1, 1.068 96, 2054, 6, 0, 1.562 96, 2054, 6, 1, 1.562 112, 0, 0, 0, 0.736 112, 0, 0, 1, 0.736 112, 7, 0, 0, 0.675 112, 7, 0, 1, 0.675 112, 0, 7, 0, 0.778 112, 0, 7, 1, 0.778 112, 7, 7, 0, 0.909 112, 7, 7, 1, 0.909 112, 2048, 0, 0, 0.736 112, 2048, 0, 1, 0.736 112, 2055, 0, 0, 0.675 112, 2055, 0, 1, 0.675 112, 2048, 7, 0, 0.778 112, 2048, 7, 1, 0.778 112, 2055, 7, 0, 0.909 112, 2055, 7, 1, 0.909 144, 0, 0, 0, 0.857 144, 0, 0, 1, 0.857 144, 9, 0, 0, 0.941 144, 9, 0, 1, 0.943 144, 0, 9, 0, 1.137 144, 0, 9, 1, 1.137 144, 9, 9, 0, 1.514 144, 9, 9, 1, 1.514 144, 2048, 0, 0, 0.857 144, 2048, 0, 1, 0.857 144, 2057, 0, 0, 0.939 144, 2057, 0, 1, 0.945 144, 2048, 9, 0, 0.922 144, 2048, 9, 1, 0.922 144, 2057, 9, 0, 1.514 144, 2057, 9, 1, 1.514 160, 0, 0, 0, 0.698 160, 0, 0, 1, 0.698 160, 10, 0, 0, 0.91 160, 10, 0, 1, 0.91 160, 0, 10, 0, 1.211 160, 0, 10, 1, 1.212 160, 10, 10, 0, 1.357 160, 10, 10, 1, 1.357 160, 2048, 0, 0, 0.698 160, 2048, 0, 1, 0.698 160, 2058, 0, 0, 0.91 160, 2058, 0, 1, 0.91 160, 2048, 10, 0, 0.923 160, 2048, 10, 1, 0.923 160, 2058, 10, 0, 1.357 160, 2058, 10, 1, 1.357 176, 0, 0, 0, 0.796 176, 0, 0, 1, 0.796 176, 11, 0, 0, 0.804 176, 11, 0, 1, 0.804 176, 0, 11, 0, 0.774 176, 0, 11, 1, 0.774 176, 11, 11, 0, 0.814 176, 11, 11, 1, 0.814 176, 2048, 0, 0, 0.796 176, 2048, 0, 1, 0.796 176, 2059, 0, 0, 0.804 176, 2059, 
0, 1, 0.804 176, 2048, 11, 0, 0.774 176, 2048, 11, 1, 0.774 176, 2059, 11, 0, 0.814 176, 2059, 11, 1, 0.814 192, 0, 0, 0, 0.778 192, 0, 0, 1, 0.778 192, 12, 0, 0, 0.881 192, 12, 0, 1, 0.881 192, 0, 12, 0, 1.167 192, 0, 12, 1, 1.167 192, 12, 12, 0, 0.841 192, 12, 12, 1, 0.841 192, 2048, 0, 0, 0.778 192, 2048, 0, 1, 0.778 192, 2060, 0, 0, 0.881 192, 2060, 0, 1, 0.881 192, 2048, 12, 0, 0.889 192, 2048, 12, 1, 0.889 192, 2060, 12, 0, 0.906 192, 2060, 12, 1, 0.906 208, 0, 0, 0, 0.833 208, 0, 0, 1, 0.833 208, 13, 0, 0, 0.921 208, 13, 0, 1, 0.921 208, 0, 13, 0, 0.835 208, 0, 13, 1, 0.833 208, 13, 13, 0, 1.333 208, 13, 13, 1, 1.333 208, 2048, 0, 0, 0.833 208, 2048, 0, 1, 0.833 208, 2061, 0, 0, 0.921 208, 2061, 0, 1, 0.921 208, 2048, 13, 0, 0.833 208, 2048, 13, 1, 0.833 208, 2061, 13, 0, 1.333 208, 2061, 13, 1, 1.333 224, 0, 0, 0, 0.93 224, 0, 0, 1, 0.93 224, 14, 0, 0, 1.0 224, 14, 0, 1, 1.0 224, 0, 14, 0, 1.15 224, 0, 14, 1, 1.15 224, 14, 14, 0, 1.452 224, 14, 14, 1, 1.452 224, 2048, 0, 0, 0.93 224, 2048, 0, 1, 0.93 224, 2062, 0, 0, 1.0 224, 2062, 0, 1, 1.0 224, 2048, 14, 0, 0.833 224, 2048, 14, 1, 0.833 224, 2062, 14, 0, 1.452 224, 2062, 14, 1, 1.452 240, 0, 0, 0, 0.909 240, 0, 0, 1, 0.909 240, 15, 0, 0, 0.797 240, 15, 0, 1, 0.797 240, 0, 15, 0, 0.771 240, 0, 15, 1, 0.771 240, 15, 15, 0, 0.93 240, 15, 15, 1, 0.93 240, 2048, 0, 0, 0.909 240, 2048, 0, 1, 0.909 240, 2063, 0, 0, 0.797 240, 2063, 0, 1, 0.797 240, 2048, 15, 0, 0.771 240, 2048, 15, 1, 0.771 240, 2063, 15, 0, 0.93 240, 2063, 15, 1, 0.93 272, 0, 0, 0, 0.9 272, 0, 0, 1, 0.9 272, 17, 0, 0, 1.015 272, 17, 0, 1, 1.015 272, 0, 17, 0, 0.926 272, 0, 17, 1, 0.927 272, 17, 17, 0, 0.892 272, 17, 17, 1, 0.892 272, 2048, 0, 0, 0.9 272, 2048, 0, 1, 0.9 272, 2065, 0, 0, 1.015 272, 2065, 0, 1, 1.015 272, 2048, 17, 0, 0.927 272, 2048, 17, 1, 0.927 272, 2065, 17, 0, 0.878 272, 2065, 17, 1, 0.878 288, 0, 0, 0, 0.882 288, 0, 0, 1, 0.882 288, 18, 0, 0, 0.803 288, 18, 0, 1, 0.803 288, 0, 18, 0, 0.768 288, 0, 18, 1, 0.768 288, 18, 18, 0, 0.882 288, 18, 18, 1, 0.882 288, 2048, 0, 0, 0.882 288, 2048, 0, 1, 0.882 288, 2066, 0, 0, 0.803 288, 2066, 0, 1, 0.803 288, 2048, 18, 0, 0.768 288, 2048, 18, 1, 0.768 288, 2066, 18, 0, 0.882 288, 2066, 18, 1, 0.882 304, 0, 0, 0, 0.865 304, 0, 0, 1, 0.865 304, 19, 0, 0, 0.944 304, 19, 0, 1, 0.944 304, 0, 19, 0, 0.943 304, 0, 19, 1, 0.943 304, 19, 19, 0, 0.956 304, 19, 19, 1, 0.956 304, 2048, 0, 0, 0.866 304, 2048, 0, 1, 0.865 304, 2067, 0, 0, 0.944 304, 2067, 0, 1, 0.944 304, 2048, 19, 0, 0.943 304, 2048, 19, 1, 0.943 304, 2067, 19, 0, 0.947 304, 2067, 19, 1, 0.947 320, 0, 0, 0, 0.944 320, 0, 0, 1, 0.944 320, 20, 0, 0, 0.962 320, 20, 0, 1, 0.962 320, 0, 20, 0, 1.214 320, 0, 20, 1, 1.214 320, 20, 20, 0, 1.365 320, 20, 20, 1, 1.365 320, 2048, 0, 0, 0.943 320, 2048, 0, 1, 0.943 320, 2068, 0, 0, 0.962 320, 2068, 0, 1, 0.962 320, 2048, 20, 0, 0.914 320, 2048, 20, 1, 0.914 320, 2068, 20, 0, 1.365 320, 2068, 20, 1, 1.365 336, 0, 0, 0, 1.0 336, 0, 0, 1, 1.0 336, 21, 0, 0, 0.986 336, 21, 0, 1, 0.986 336, 0, 21, 0, 0.853 336, 0, 21, 1, 0.853 336, 21, 21, 0, 0.843 336, 21, 21, 1, 0.843 336, 2048, 0, 0, 1.0 336, 2048, 0, 1, 1.0 336, 2069, 0, 0, 0.986 336, 2069, 0, 1, 0.986 336, 2048, 21, 0, 0.853 336, 2048, 21, 1, 0.853 336, 2069, 21, 0, 0.831 336, 2069, 21, 1, 0.831 352, 0, 0, 0, 0.98 352, 0, 0, 1, 0.98 352, 22, 0, 0, 0.811 352, 22, 0, 1, 0.811 352, 0, 22, 0, 0.882 352, 0, 22, 1, 0.882 352, 22, 22, 0, 1.1 352, 22, 22, 1, 1.1 352, 2048, 0, 0, 0.98 352, 2048, 0, 1, 0.98 352, 2070, 0, 0, 0.811 352, 2070, 0, 1, 0.811 352, 2048, 22, 0, 0.882 
352, 2048, 22, 1, 0.882 352, 2070, 22, 0, 1.1 352, 2070, 22, 1, 1.1 368, 0, 0, 0, 1.058 368, 0, 0, 1, 1.058 368, 23, 0, 0, 1.0 368, 23, 0, 1, 1.0 368, 0, 23, 0, 0.948 368, 0, 23, 1, 0.948 368, 23, 23, 0, 0.723 368, 23, 23, 1, 0.723 368, 2048, 0, 0, 1.058 368, 2048, 0, 1, 1.058 368, 2071, 0, 0, 1.0 368, 2071, 0, 1, 1.0 368, 2048, 23, 0, 0.948 368, 2048, 23, 1, 0.948 368, 2071, 23, 0, 0.701 368, 2071, 23, 1, 0.701 384, 0, 0, 0, 1.012 384, 0, 0, 1, 1.012 384, 24, 0, 0, 1.04 384, 24, 0, 1, 1.04 384, 0, 24, 0, 1.154 384, 0, 24, 1, 1.154 384, 24, 24, 0, 1.423 384, 24, 24, 1, 1.423 384, 2048, 0, 0, 1.012 384, 2048, 0, 1, 1.012 384, 2072, 0, 0, 1.04 384, 2072, 0, 1, 1.04 384, 2048, 24, 0, 0.91 384, 2048, 24, 1, 0.91 384, 2072, 24, 0, 1.423 384, 2072, 24, 1, 1.423 400, 0, 0, 0, 0.948 400, 0, 0, 1, 0.948 400, 25, 0, 0, 0.957 400, 25, 0, 1, 0.957 400, 0, 25, 0, 1.099 400, 0, 25, 1, 1.069 400, 25, 25, 0, 0.885 400, 25, 25, 1, 0.885 400, 2048, 0, 0, 0.948 400, 2048, 0, 1, 0.948 400, 2073, 0, 0, 0.957 400, 2073, 0, 1, 0.957 400, 2048, 25, 0, 0.94 400, 2048, 25, 1, 0.94 400, 2073, 25, 0, 0.908 400, 2073, 25, 1, 0.908 416, 0, 0, 0, 1.017 416, 0, 0, 1, 1.017 416, 26, 0, 0, 0.903 416, 26, 0, 1, 0.903 416, 0, 26, 0, 0.881 416, 0, 26, 1, 0.881 416, 26, 26, 0, 1.035 416, 26, 26, 1, 1.035 416, 2048, 0, 0, 1.017 416, 2048, 0, 1, 1.017 416, 2074, 0, 0, 0.903 416, 2074, 0, 1, 0.903 416, 2048, 26, 0, 0.881 416, 2048, 26, 1, 0.881 416, 2074, 26, 0, 1.034 416, 2074, 26, 1, 1.035 432, 0, 0, 0, 1.0 432, 0, 0, 1, 1.0 432, 27, 0, 0, 0.933 432, 27, 0, 1, 0.933 432, 0, 27, 0, 0.941 432, 0, 27, 1, 0.941 432, 27, 27, 0, 0.953 432, 27, 27, 1, 0.954 432, 2048, 0, 0, 1.0 432, 2048, 0, 1, 1.0 432, 2075, 0, 0, 0.933 432, 2075, 0, 1, 0.933 432, 2048, 27, 0, 0.941 432, 2048, 27, 1, 0.941 432, 2075, 27, 0, 0.93 432, 2075, 27, 1, 0.93 448, 0, 0, 0, 0.984 448, 0, 0, 1, 0.984 448, 28, 0, 0, 0.896 448, 28, 0, 1, 0.896 448, 0, 28, 0, 1.244 448, 0, 28, 1, 1.244 448, 28, 28, 0, 1.333 448, 28, 28, 1, 1.333 448, 2048, 0, 0, 0.984 448, 2048, 0, 1, 0.984 448, 2076, 0, 0, 0.896 448, 2076, 0, 1, 0.896 448, 2048, 28, 0, 0.988 448, 2048, 28, 1, 0.988 448, 2076, 28, 0, 1.333 448, 2076, 28, 1, 1.333 464, 0, 0, 0, 1.083 464, 0, 0, 1, 1.083 464, 29, 0, 0, 0.978 464, 29, 0, 1, 0.978 464, 0, 29, 0, 0.924 464, 0, 29, 1, 0.924 464, 29, 29, 0, 0.901 464, 29, 29, 1, 0.901 464, 2048, 0, 0, 1.083 464, 2048, 0, 1, 1.083 464, 2077, 0, 0, 0.978 464, 2077, 0, 1, 0.978 464, 2048, 29, 0, 0.924 464, 2048, 29, 1, 0.924 464, 2077, 29, 0, 0.89 464, 2077, 29, 1, 0.89 480, 0, 0, 0, 1.066 480, 0, 0, 1, 1.066 480, 30, 0, 0, 0.9 480, 30, 0, 1, 0.9 480, 0, 30, 0, 0.88 480, 0, 30, 1, 0.88 480, 30, 30, 0, 1.083 480, 30, 30, 1, 1.083 480, 2048, 0, 0, 1.066 480, 2048, 0, 1, 1.066 480, 2078, 0, 0, 0.9 480, 2078, 0, 1, 0.9 480, 2048, 30, 0, 0.88 480, 2048, 30, 1, 0.88 480, 2078, 30, 0, 1.083 480, 2078, 30, 1, 1.083 496, 0, 0, 0, 1.032 496, 0, 0, 1, 1.032 496, 31, 0, 0, 0.95 496, 31, 0, 1, 0.95 496, 0, 31, 0, 1.011 496, 0, 31, 1, 1.011 496, 31, 31, 0, 0.973 496, 31, 31, 1, 0.973 496, 2048, 0, 0, 1.032 496, 2048, 0, 1, 1.032 496, 2079, 0, 0, 0.95 496, 2079, 0, 1, 0.95 496, 2048, 31, 0, 1.011 496, 2048, 31, 1, 1.011 496, 2079, 31, 0, 0.941 496, 2079, 31, 1, 0.941 1024, 32, 0, 0, 1.143 1024, 32, 0, 1, 1.143 1024, 0, 32, 0, 1.143 1024, 0, 32, 1, 1.143 1024, 32, 32, 0, 1.143 1024, 32, 32, 1, 1.143 1024, 2080, 0, 0, 1.143 1024, 2080, 0, 1, 1.143 1024, 2048, 32, 0, 1.143 1024, 2048, 32, 1, 1.143 1024, 2080, 32, 0, 1.143 1024, 2080, 32, 1, 1.143 1056, 0, 0, 0, 1.168 1056, 0, 0, 1, 1.168 
1056, 33, 0, 0, 1.067 1056, 33, 0, 1, 1.067 1056, 0, 33, 0, 0.977 1056, 0, 33, 1, 0.977 1056, 33, 33, 0, 1.043 1056, 33, 33, 1, 1.043 1056, 2048, 0, 0, 1.168 1056, 2048, 0, 1, 1.168 1056, 2081, 0, 0, 1.067 1056, 2081, 0, 1, 1.067 1056, 2048, 33, 0, 0.977 1056, 2048, 33, 1, 0.977 1056, 2081, 33, 0, 1.0 1056, 2081, 33, 1, 1.0 1088, 0, 0, 0, 1.171 1088, 0, 0, 1, 1.171 1088, 34, 0, 0, 1.041 1088, 34, 0, 1, 1.041 1088, 0, 34, 0, 1.079 1088, 0, 34, 1, 1.079 1088, 34, 34, 0, 0.966 1088, 34, 34, 1, 0.966 1088, 2048, 0, 0, 1.171 1088, 2048, 0, 1, 1.171 1088, 2082, 0, 0, 1.041 1088, 2082, 0, 1, 1.041 1088, 2048, 34, 0, 0.994 1088, 2048, 34, 1, 0.994 1088, 2082, 34, 0, 0.966 1088, 2082, 34, 1, 0.966 1120, 0, 0, 0, 1.152 1120, 0, 0, 1, 1.153 1120, 35, 0, 0, 1.051 1120, 35, 0, 1, 1.051 1120, 0, 35, 0, 1.0 1120, 0, 35, 1, 1.0 1120, 35, 35, 0, 1.068 1120, 35, 35, 1, 1.068 1120, 2048, 0, 0, 1.151 1120, 2048, 0, 1, 1.151 1120, 2083, 0, 0, 1.051 1120, 2083, 0, 1, 1.051 1120, 2048, 35, 0, 1.0 1120, 2048, 35, 1, 1.0 1120, 2083, 35, 0, 1.027 1120, 2083, 35, 1, 1.027 1152, 0, 0, 0, 1.159 1152, 0, 0, 1, 1.159 1152, 36, 0, 0, 1.034 1152, 36, 0, 1, 1.034 1152, 0, 36, 0, 1.07 1152, 0, 36, 1, 1.07 1152, 36, 36, 0, 0.967 1152, 36, 36, 1, 0.967 1152, 2048, 0, 0, 1.159 1152, 2048, 0, 1, 1.159 1152, 2084, 0, 0, 1.034 1152, 2084, 0, 1, 1.034 1152, 2048, 36, 0, 0.984 1152, 2048, 36, 1, 0.984 1152, 2084, 36, 0, 0.967 1152, 2084, 36, 1, 0.967 1184, 0, 0, 0, 1.157 1184, 0, 0, 1, 1.157 1184, 37, 0, 0, 1.067 1184, 37, 0, 1, 1.066 1184, 0, 37, 0, 0.993 1184, 0, 37, 1, 0.993 1184, 37, 37, 0, 1.08 1184, 37, 37, 1, 1.081 1184, 2048, 0, 0, 1.157 1184, 2048, 0, 1, 1.157 1184, 2085, 0, 0, 1.066 1184, 2085, 0, 1, 1.066 1184, 2048, 37, 0, 0.993 1184, 2048, 37, 1, 0.993 1184, 2085, 37, 0, 1.04 1184, 2085, 37, 1, 1.04 1216, 0, 0, 0, 1.139 1216, 0, 0, 1, 1.139 1216, 38, 0, 0, 1.024 1216, 38, 0, 1, 1.024 1216, 0, 38, 0, 1.087 1216, 0, 38, 1, 1.087 1216, 38, 38, 0, 1.0 1216, 38, 38, 1, 1.0 1216, 2048, 0, 0, 1.138 1216, 2048, 0, 1, 1.138 1216, 2086, 0, 0, 1.024 1216, 2086, 0, 1, 1.024 1216, 2048, 38, 0, 1.01 1216, 2048, 38, 1, 1.01 1216, 2086, 38, 0, 1.0 1216, 2086, 38, 1, 1.0 1248, 0, 0, 0, 1.176 1248, 0, 0, 1, 1.174 1248, 39, 0, 0, 1.074 1248, 39, 0, 1, 1.074 1248, 0, 39, 0, 0.966 1248, 0, 39, 1, 0.985 1248, 39, 39, 0, 1.064 1248, 39, 39, 1, 1.064 1248, 2048, 0, 0, 1.179 1248, 2048, 0, 1, 1.179 1248, 2087, 0, 0, 1.074 1248, 2087, 0, 1, 1.074 1248, 2048, 39, 0, 0.985 1248, 2048, 39, 1, 0.985 1248, 2087, 39, 0, 1.026 1248, 2087, 39, 1, 1.026 1280, 0, 0, 0, 0.993 1280, 0, 0, 1, 0.993 1280, 40, 0, 0, 1.051 1280, 40, 0, 1, 1.051 1280, 0, 40, 0, 1.044 1280, 0, 40, 1, 1.045 1280, 40, 40, 0, 1.25 1280, 40, 40, 1, 1.25 1280, 2048, 0, 0, 0.992 1280, 2048, 0, 1, 0.992 1280, 2088, 0, 0, 1.051 1280, 2088, 0, 1, 1.051 1280, 2048, 40, 0, 0.946 1280, 2048, 40, 1, 0.946 1280, 2088, 40, 0, 1.252 1280, 2088, 40, 1, 1.252 1312, 0, 0, 0, 0.969 1312, 0, 0, 1, 0.969 1312, 41, 0, 0, 0.991 1312, 41, 0, 1, 0.991 1312, 0, 41, 0, 0.837 1312, 0, 41, 1, 0.837 1312, 41, 41, 0, 1.025 1312, 41, 41, 1, 1.025 1312, 2048, 0, 0, 0.969 1312, 2048, 0, 1, 0.969 1312, 2089, 0, 0, 0.991 1312, 2089, 0, 1, 0.99 1312, 2048, 41, 0, 0.837 1312, 2048, 41, 1, 0.837 1312, 2089, 41, 0, 0.975 1312, 2089, 41, 1, 0.975 1344, 0, 0, 0, 0.988 1344, 0, 0, 1, 0.988 1344, 42, 0, 0, 1.031 1344, 42, 0, 1, 1.031 1344, 0, 42, 0, 1.033 1344, 0, 42, 1, 1.033 1344, 42, 42, 0, 0.982 1344, 42, 42, 1, 0.982 1344, 2048, 0, 0, 0.992 1344, 2048, 0, 1, 0.992 1344, 2090, 0, 0, 1.031 1344, 2090, 0, 1, 1.031 1344, 
2048, 42, 0, 0.943 1344, 2048, 42, 1, 0.942 1344, 2090, 42, 0, 0.982 1344, 2090, 42, 1, 0.982 1376, 0, 0, 0, 1.016 1376, 0, 0, 1, 1.016 1376, 43, 0, 0, 1.01 1376, 43, 0, 1, 1.01 1376, 0, 43, 0, 0.829 1376, 0, 43, 1, 0.829 1376, 43, 43, 0, 1.024 1376, 43, 43, 1, 1.024 1376, 2048, 0, 0, 1.006 1376, 2048, 0, 1, 1.015 1376, 2091, 0, 0, 1.01 1376, 2091, 0, 1, 1.01 1376, 2048, 43, 0, 0.829 1376, 2048, 43, 1, 0.829 1376, 2091, 43, 0, 0.98 1376, 2091, 43, 1, 0.98 1408, 0, 0, 0, 0.987 1408, 0, 0, 1, 0.987 1408, 44, 0, 0, 1.015 1408, 44, 0, 1, 1.015 1408, 0, 44, 0, 1.018 1408, 0, 44, 1, 1.014 1408, 44, 44, 0, 1.004 1408, 44, 44, 1, 0.994 1408, 2048, 0, 0, 0.988 1408, 2048, 0, 1, 0.988 1408, 2092, 0, 0, 1.015 1408, 2092, 0, 1, 1.015 1408, 2048, 44, 0, 0.955 1408, 2048, 44, 1, 0.955 1408, 2092, 44, 0, 1.0 1408, 2092, 44, 1, 0.994 1440, 0, 0, 0, 0.986 1440, 0, 0, 1, 0.986 1440, 45, 0, 0, 1.013 1440, 45, 0, 1, 1.013 1440, 0, 45, 0, 0.814 1440, 0, 45, 1, 0.814 1440, 45, 45, 0, 1.006 1440, 45, 45, 1, 1.006 1440, 2048, 0, 0, 0.986 1440, 2048, 0, 1, 0.986 1440, 2093, 0, 0, 1.013 1440, 2093, 0, 1, 1.013 1440, 2048, 45, 0, 0.814 1440, 2048, 45, 1, 0.814 1440, 2093, 45, 0, 0.966 1440, 2093, 45, 1, 0.966 1472, 0, 0, 0, 0.997 1472, 0, 0, 1, 0.994 1472, 46, 0, 0, 1.045 1472, 46, 0, 1, 1.045 1472, 0, 46, 0, 1.026 1472, 0, 46, 1, 1.026 1472, 46, 46, 0, 0.966 1472, 46, 46, 1, 0.966 1472, 2048, 0, 0, 1.0 1472, 2048, 0, 1, 0.996 1472, 2094, 0, 0, 1.045 1472, 2094, 0, 1, 1.045 1472, 2048, 46, 0, 0.939 1472, 2048, 46, 1, 0.939 1472, 2094, 46, 0, 0.966 1472, 2094, 46, 1, 0.966 1504, 0, 0, 0, 0.993 1504, 0, 0, 1, 0.993 1504, 47, 0, 0, 0.999 1504, 47, 0, 1, 0.999 1504, 0, 47, 0, 0.826 1504, 0, 47, 1, 0.826 1504, 47, 47, 0, 1.023 1504, 47, 47, 1, 1.023 1504, 2048, 0, 0, 0.993 1504, 2048, 0, 1, 0.993 1504, 2095, 0, 0, 0.999 1504, 2095, 0, 1, 0.999 1504, 2048, 47, 0, 0.826 1504, 2048, 47, 1, 0.826 1504, 2095, 47, 0, 0.993 1504, 2095, 47, 1, 0.993 1536, 0, 0, 0, 0.992 1536, 0, 0, 1, 0.991 1536, 48, 0, 0, 1.019 1536, 48, 0, 1, 1.019 1536, 0, 48, 0, 1.025 1536, 0, 48, 1, 1.024 1536, 48, 48, 0, 0.994 1536, 48, 48, 1, 0.994 1536, 2048, 0, 0, 0.994 1536, 2048, 0, 1, 0.994 1536, 2096, 0, 0, 1.019 1536, 2096, 0, 1, 1.019 1536, 2048, 48, 0, 1.025 1536, 2048, 48, 1, 1.025 1536, 2096, 48, 0, 0.994 1536, 2096, 48, 1, 0.994 1568, 0, 0, 0, 0.994 1568, 0, 0, 1, 0.994 1568, 49, 0, 0, 0.903 1568, 49, 0, 1, 0.903 1568, 0, 49, 0, 1.144 1568, 0, 49, 1, 1.144 1568, 49, 49, 0, 1.461 1568, 49, 49, 1, 1.461 1568, 2048, 0, 0, 0.993 1568, 2048, 0, 1, 0.993 1568, 2097, 0, 0, 0.903 1568, 2097, 0, 1, 0.903 1568, 2048, 49, 0, 1.09 1568, 2048, 49, 1, 1.09 1568, 2097, 49, 0, 1.46 1568, 2097, 49, 1, 1.46 1600, 0, 0, 0, 0.981 1600, 0, 0, 1, 0.981 1600, 50, 0, 0, 1.022 1600, 50, 0, 1, 1.022 1600, 0, 50, 0, 1.017 1600, 0, 50, 1, 1.017 1600, 50, 50, 0, 0.973 1600, 50, 50, 1, 0.973 1600, 2048, 0, 0, 0.981 1600, 2048, 0, 1, 0.981 1600, 2098, 0, 0, 1.022 1600, 2098, 0, 1, 1.022 1600, 2048, 50, 0, 0.961 1600, 2048, 50, 1, 0.961 1600, 2098, 50, 0, 0.973 1600, 2098, 50, 1, 0.973 1632, 0, 0, 0, 1.019 1632, 0, 0, 1, 1.019 1632, 51, 0, 0, 0.893 1632, 51, 0, 1, 0.893 1632, 0, 51, 0, 1.131 1632, 0, 51, 1, 1.131 1632, 51, 51, 0, 1.444 1632, 51, 51, 1, 1.444 1632, 2048, 0, 0, 1.019 1632, 2048, 0, 1, 1.019 1632, 2099, 0, 0, 0.893 1632, 2099, 0, 1, 0.893 1632, 2048, 51, 0, 1.079 1632, 2048, 51, 1, 1.079 1632, 2099, 51, 0, 1.449 1632, 2099, 51, 1, 1.449 1664, 0, 0, 0, 1.005 1664, 0, 0, 1, 1.004 1664, 52, 0, 0, 0.986 1664, 52, 0, 1, 0.986 1664, 0, 52, 0, 1.004 1664, 0, 52, 1, 
1.004 1664, 52, 52, 0, 0.976 1664, 52, 52, 1, 0.976 1664, 2048, 0, 0, 1.006 1664, 2048, 0, 1, 1.006 1664, 2100, 0, 0, 0.993 1664, 2100, 0, 1, 0.993 1664, 2048, 52, 0, 0.946 1664, 2048, 52, 1, 0.946 1664, 2100, 52, 0, 0.976 1664, 2100, 52, 1, 0.976 1696, 0, 0, 0, 0.994 1696, 0, 0, 1, 0.992 1696, 53, 0, 0, 0.884 1696, 53, 0, 1, 0.884 1696, 0, 53, 0, 1.141 1696, 0, 53, 1, 1.141 1696, 53, 53, 0, 1.43 1696, 53, 53, 1, 1.43 1696, 2048, 0, 0, 0.994 1696, 2048, 0, 1, 0.994 1696, 2101, 0, 0, 0.884 1696, 2101, 0, 1, 0.884 1696, 2048, 53, 0, 1.088 1696, 2048, 53, 1, 1.088 1696, 2101, 53, 0, 1.429 1696, 2101, 53, 1, 1.429 1728, 0, 0, 0, 0.978 1728, 0, 0, 1, 0.978 1728, 54, 0, 0, 1.031 1728, 54, 0, 1, 1.033 1728, 0, 54, 0, 1.0 1728, 0, 54, 1, 1.0 1728, 54, 54, 0, 0.96 1728, 54, 54, 1, 0.96 1728, 2048, 0, 0, 0.976 1728, 2048, 0, 1, 0.976 1728, 2102, 0, 0, 1.033 1728, 2102, 0, 1, 1.033 1728, 2048, 54, 0, 0.947 1728, 2048, 54, 1, 0.947 1728, 2102, 54, 0, 0.96 1728, 2102, 54, 1, 0.96 1760, 0, 0, 0, 1.019 1760, 0, 0, 1, 1.021 1760, 55, 0, 0, 0.9 1760, 55, 0, 1, 0.9 1760, 0, 55, 0, 1.125 1760, 0, 55, 1, 1.125 1760, 55, 55, 0, 1.437 1760, 55, 55, 1, 1.436 1760, 2048, 0, 0, 1.016 1760, 2048, 0, 1, 1.015 1760, 2103, 0, 0, 0.9 1760, 2103, 0, 1, 0.9 1760, 2048, 55, 0, 1.073 1760, 2048, 55, 1, 1.074 1760, 2103, 55, 0, 1.44 1760, 2103, 55, 1, 1.44 1792, 0, 0, 0, 1.002 1792, 0, 0, 1, 1.002 1792, 56, 0, 0, 1.028 1792, 56, 0, 1, 1.028 1792, 0, 56, 0, 1.014 1792, 0, 56, 1, 1.015 1792, 56, 56, 0, 1.191 1792, 56, 56, 1, 1.191 1792, 2048, 0, 0, 1.003 1792, 2048, 0, 1, 1.003 1792, 2104, 0, 0, 1.028 1792, 2104, 0, 1, 1.028 1792, 2048, 56, 0, 0.963 1792, 2048, 56, 1, 0.963 1792, 2104, 56, 0, 1.191 1792, 2104, 56, 1, 1.191 1824, 0, 0, 0, 0.999 1824, 0, 0, 1, 1.0 1824, 57, 0, 0, 0.891 1824, 57, 0, 1, 0.891 1824, 0, 57, 0, 1.114 1824, 0, 57, 1, 1.114 1824, 57, 57, 0, 1.407 1824, 57, 57, 1, 1.407 1824, 2048, 0, 0, 1.001 1824, 2048, 0, 1, 1.001 1824, 2105, 0, 0, 0.891 1824, 2105, 0, 1, 0.891 1824, 2048, 57, 0, 1.064 1824, 2048, 57, 1, 1.064 1824, 2105, 57, 0, 1.407 1824, 2105, 57, 1, 1.407 1856, 0, 0, 0, 0.989 1856, 0, 0, 1, 0.987 1856, 58, 0, 0, 1.042 1856, 58, 0, 1, 1.042 1856, 0, 58, 0, 1.007 1856, 0, 58, 1, 1.007 1856, 58, 58, 0, 0.978 1856, 58, 58, 1, 0.972 1856, 2048, 0, 0, 0.992 1856, 2048, 0, 1, 0.992 1856, 2106, 0, 0, 1.042 1856, 2106, 0, 1, 1.042 1856, 2048, 58, 0, 0.954 1856, 2048, 58, 1, 0.954 1856, 2106, 58, 0, 0.979 1856, 2106, 58, 1, 0.972 1888, 0, 0, 0, 0.994 1888, 0, 0, 1, 0.994 1888, 59, 0, 0, 0.883 1888, 59, 0, 1, 0.883 1888, 0, 59, 0, 1.121 1888, 0, 59, 1, 1.123 1888, 59, 59, 0, 1.413 1888, 59, 59, 1, 1.413 1888, 2048, 0, 0, 0.985 1888, 2048, 0, 1, 0.994 1888, 2107, 0, 0, 0.883 1888, 2107, 0, 1, 0.883 1888, 2048, 59, 0, 1.076 1888, 2048, 59, 1, 1.076 1888, 2107, 59, 0, 1.413 1888, 2107, 59, 1, 1.413 1920, 0, 0, 0, 1.0 1920, 0, 0, 1, 0.999 1920, 60, 0, 0, 1.033 1920, 60, 0, 1, 1.033 1920, 0, 60, 0, 0.996 1920, 0, 60, 1, 0.997 1920, 60, 60, 0, 0.968 1920, 60, 60, 1, 0.968 1920, 2048, 0, 0, 1.0 1920, 2048, 0, 1, 1.0 1920, 2108, 0, 0, 1.034 1920, 2108, 0, 1, 1.034 1920, 2048, 60, 0, 0.949 1920, 2048, 60, 1, 0.949 1920, 2108, 60, 0, 0.968 1920, 2108, 60, 1, 0.968 1952, 0, 0, 0, 1.004 1952, 0, 0, 1, 1.004 1952, 61, 0, 0, 0.898 1952, 61, 0, 1, 0.898 1952, 0, 61, 0, 1.118 1952, 0, 61, 1, 1.118 1952, 61, 61, 0, 1.387 1952, 61, 61, 1, 1.387 1952, 2048, 0, 0, 1.004 1952, 2048, 0, 1, 1.004 1952, 2109, 0, 0, 0.898 1952, 2109, 0, 1, 0.898 1952, 2048, 61, 0, 1.071 1952, 2048, 61, 1, 1.071 1952, 2109, 61, 0, 1.387 1952, 2109, 
61, 1, 1.387 1984, 0, 0, 0, 0.993 1984, 0, 0, 1, 0.993 1984, 62, 0, 0, 1.025 1984, 62, 0, 1, 1.025 1984, 0, 62, 0, 1.005 1984, 0, 62, 1, 1.007 1984, 62, 62, 0, 0.982 1984, 62, 62, 1, 0.982 1984, 2048, 0, 0, 0.993 1984, 2048, 0, 1, 0.993 1984, 2110, 0, 0, 1.025 1984, 2110, 0, 1, 1.025 1984, 2048, 62, 0, 0.96 1984, 2048, 62, 1, 0.96 1984, 2110, 62, 0, 0.982 1984, 2110, 62, 1, 0.982 2016, 0, 0, 0, 1.0 2016, 0, 0, 1, 0.999 2016, 63, 0, 0, 0.889 2016, 63, 0, 1, 0.89 2016, 0, 63, 0, 1.091 2016, 0, 63, 1, 1.092 2016, 63, 63, 0, 1.362 2016, 63, 63, 1, 1.363 2016, 2048, 0, 0, 1.0 2016, 2048, 0, 1, 1.0 2016, 2111, 0, 0, 0.965 2016, 2111, 0, 1, 0.965 2016, 2048, 63, 0, 1.049 2016, 2048, 63, 1, 1.049 2016, 2111, 63, 0, 1.405 2016, 2111, 63, 1, 1.405 2048, 32, 0, 0, 1.01 2048, 32, 0, 1, 1.01 2048, 0, 32, 0, 1.005 2048, 0, 32, 1, 1.005 2048, 32, 32, 0, 1.005 2048, 32, 32, 1, 1.005 2048, 0, 1, 0, 0.983 2048, 0, 1, 1, 0.984 2048, 1, 0, 0, 1.039 2048, 1, 0, 1, 1.039 2048, 32, 1, 0, 1.063 2048, 32, 1, 1, 1.063 2048, 1, 32, 0, 0.94 2048, 1, 32, 1, 0.94 2048, 2048, 1, 0, 0.981 2048, 2048, 1, 1, 0.981 2048, 2049, 0, 0, 0.904 2048, 2049, 0, 1, 0.904 2112, 0, 0, 0, 0.996 2112, 0, 0, 1, 0.995 2112, 1, 0, 0, 1.031 2112, 1, 0, 1, 1.031 2112, 33, 0, 0, 1.01 2112, 33, 0, 1, 1.01 2112, 0, 1, 0, 0.972 2112, 0, 1, 1, 0.972 2112, 0, 33, 0, 0.987 2112, 0, 33, 1, 0.987 2112, 1, 1, 0, 0.914 2112, 1, 1, 1, 0.914 2112, 33, 33, 0, 0.983 2112, 33, 33, 1, 0.983 2112, 2048, 0, 0, 0.994 2112, 2048, 0, 1, 0.99 2112, 2049, 0, 0, 1.031 2112, 2049, 0, 1, 1.031 2112, 2048, 1, 0, 0.955 2112, 2048, 1, 1, 0.955 2112, 2049, 1, 0, 0.906 2112, 2049, 1, 1, 0.906 2112, 33, 1, 0, 1.163 2112, 33, 1, 1, 1.164 2112, 1, 33, 0, 1.046 2112, 1, 33, 1, 1.046 2176, 0, 0, 0, 0.984 2176, 0, 0, 1, 0.985 2176, 2, 0, 0, 1.023 2176, 2, 0, 1, 1.023 2176, 34, 0, 0, 1.0 2176, 34, 0, 1, 1.0 2176, 0, 2, 0, 0.985 2176, 0, 2, 1, 0.985 2176, 0, 34, 0, 0.995 2176, 0, 34, 1, 0.982 2176, 2, 2, 0, 0.928 2176, 2, 2, 1, 0.928 2176, 34, 34, 0, 1.004 2176, 34, 34, 1, 1.004 2176, 2048, 0, 0, 0.985 2176, 2048, 0, 1, 0.986 2176, 2050, 0, 0, 1.023 2176, 2050, 0, 1, 1.023 2176, 2048, 2, 0, 0.802 2176, 2048, 2, 1, 0.802 2176, 2050, 2, 0, 0.894 2176, 2050, 2, 1, 0.894 2176, 2, 1, 0, 1.068 2176, 2, 1, 1, 1.068 2176, 1, 2, 0, 0.976 2176, 1, 2, 1, 0.976 2176, 34, 1, 0, 1.077 2176, 34, 1, 1, 1.077 2176, 1, 34, 0, 0.978 2176, 1, 34, 1, 0.978 2176, 2050, 1, 0, 1.061 2176, 2050, 1, 1, 1.061 2176, 2049, 2, 0, 0.971 2176, 2049, 2, 1, 0.971 2240, 0, 0, 0, 0.994 2240, 0, 0, 1, 0.994 2240, 3, 0, 0, 1.038 2240, 3, 0, 1, 1.039 2240, 35, 0, 0, 1.019 2240, 35, 0, 1, 1.019 2240, 0, 3, 0, 0.979 2240, 0, 3, 1, 0.98 2240, 0, 35, 0, 0.991 2240, 0, 35, 1, 0.991 2240, 3, 3, 0, 0.931 2240, 3, 3, 1, 0.931 2240, 35, 35, 0, 0.999 2240, 35, 35, 1, 0.999 2240, 2048, 0, 0, 0.995 2240, 2048, 0, 1, 0.995 2240, 2051, 0, 0, 1.039 2240, 2051, 0, 1, 1.039 2240, 2048, 3, 0, 0.799 2240, 2048, 3, 1, 0.799 2240, 2051, 3, 0, 0.889 2240, 2051, 3, 1, 0.889 2240, 3, 1, 0, 1.06 2240, 3, 1, 1, 1.06 2240, 1, 3, 0, 0.968 2240, 1, 3, 1, 0.968 2240, 35, 1, 0, 1.071 2240, 35, 1, 1, 1.071 2240, 1, 35, 0, 0.971 2240, 1, 35, 1, 0.971 2240, 2051, 1, 0, 1.057 2240, 2051, 1, 1, 1.057 2240, 2049, 3, 0, 0.966 2240, 2049, 3, 1, 0.966 2304, 0, 0, 0, 0.986 2304, 0, 0, 1, 0.986 2304, 4, 0, 0, 1.031 2304, 4, 0, 1, 1.032 2304, 36, 0, 0, 1.011 2304, 36, 0, 1, 1.011 2304, 0, 4, 0, 0.968 2304, 0, 4, 1, 0.969 2304, 0, 36, 0, 0.988 2304, 0, 36, 1, 0.988 2304, 4, 4, 0, 0.93 2304, 4, 4, 1, 0.931 2304, 36, 36, 0, 0.992 2304, 36, 36, 1, 0.992 2304, 2048, 
0, 0, 0.988 2304, 2048, 0, 1, 0.988 2304, 2052, 0, 0, 1.032 2304, 2052, 0, 1, 1.032 2304, 2048, 4, 0, 0.793 2304, 2048, 4, 1, 0.793 2304, 2052, 4, 0, 0.884 2304, 2052, 4, 1, 0.884 2304, 4, 1, 0, 0.989 2304, 4, 1, 1, 0.989 2304, 1, 4, 0, 0.897 2304, 1, 4, 1, 0.898 2304, 36, 1, 0, 1.057 2304, 36, 1, 1, 1.057 2304, 1, 36, 0, 0.966 2304, 1, 36, 1, 0.966 2304, 2052, 1, 0, 1.052 2304, 2052, 1, 1, 1.052 2304, 2049, 4, 0, 0.955 2304, 2049, 4, 1, 0.955 2368, 0, 0, 0, 1.0 2368, 0, 0, 1, 1.001 2368, 5, 0, 0, 1.024 2368, 5, 0, 1, 1.025 2368, 37, 0, 0, 1.0 2368, 37, 0, 1, 1.0 2368, 0, 5, 0, 0.98 2368, 0, 5, 1, 0.981 2368, 0, 37, 0, 0.983 2368, 0, 37, 1, 0.98 2368, 5, 5, 0, 0.944 2368, 5, 5, 1, 0.944 2368, 37, 37, 0, 1.003 2368, 37, 37, 1, 1.003 2368, 2048, 0, 0, 1.002 2368, 2048, 0, 1, 1.002 2368, 2053, 0, 0, 1.025 2368, 2053, 0, 1, 1.025 2368, 2048, 5, 0, 0.801 2368, 2048, 5, 1, 0.801 2368, 2053, 5, 0, 0.907 2368, 2053, 5, 1, 0.907 2368, 5, 1, 0, 1.071 2368, 5, 1, 1, 1.071 2368, 1, 5, 0, 0.973 2368, 1, 5, 1, 0.973 2368, 37, 1, 0, 1.07 2368, 37, 1, 1, 1.07 2368, 1, 37, 0, 0.974 2368, 1, 37, 1, 0.974 2368, 2053, 1, 0, 1.065 2368, 2053, 1, 1, 1.065 2368, 2049, 5, 0, 0.967 2368, 2049, 5, 1, 0.967 2432, 0, 0, 0, 0.965 2432, 0, 0, 1, 1.0 2432, 6, 0, 0, 1.038 2432, 6, 0, 1, 1.039 2432, 38, 0, 0, 1.021 2432, 38, 0, 1, 1.021 2432, 0, 6, 0, 0.974 2432, 0, 6, 1, 0.976 2432, 0, 38, 0, 0.986 2432, 0, 38, 1, 0.986 2432, 6, 6, 0, 0.926 2432, 6, 6, 1, 0.926 2432, 38, 38, 0, 1.0 2432, 38, 38, 1, 1.0 2432, 2048, 0, 0, 1.004 2432, 2048, 0, 1, 1.004 2432, 2054, 0, 0, 1.039 2432, 2054, 0, 1, 1.039 2432, 2048, 6, 0, 0.797 2432, 2048, 6, 1, 0.797 2432, 2054, 6, 0, 0.898 2432, 2054, 6, 1, 0.898 2432, 6, 1, 0, 1.063 2432, 6, 1, 1, 1.063 2432, 1, 6, 0, 0.965 2432, 1, 6, 1, 0.965 2432, 38, 1, 0, 1.068 2432, 38, 1, 1, 1.068 2432, 1, 38, 0, 0.968 2432, 1, 38, 1, 0.968 2432, 2054, 1, 0, 1.06 2432, 2054, 1, 1, 1.06 2432, 2049, 6, 0, 0.963 2432, 2049, 6, 1, 0.963 2496, 0, 0, 0, 1.013 2496, 0, 0, 1, 1.013 2496, 7, 0, 0, 1.032 2496, 7, 0, 1, 1.032 2496, 39, 0, 0, 1.013 2496, 39, 0, 1, 1.013 2496, 0, 7, 0, 0.965 2496, 0, 7, 1, 0.965 2496, 0, 39, 0, 0.979 2496, 0, 39, 1, 0.979 2496, 7, 7, 0, 0.925 2496, 7, 7, 1, 0.925 2496, 39, 39, 0, 0.989 2496, 39, 39, 1, 0.989 2496, 2048, 0, 0, 1.013 2496, 2048, 0, 1, 1.013 2496, 2055, 0, 0, 1.032 2496, 2055, 0, 1, 1.032 2496, 2048, 7, 0, 0.792 2496, 2048, 7, 1, 0.792 2496, 2055, 7, 0, 0.93 2496, 2055, 7, 1, 0.93 2496, 7, 1, 0, 0.984 2496, 7, 1, 1, 0.984 2496, 1, 7, 0, 0.894 2496, 1, 7, 1, 0.895 2496, 39, 1, 0, 1.054 2496, 39, 1, 1, 1.054 2496, 1, 39, 0, 0.963 2496, 1, 39, 1, 0.963 2496, 2055, 1, 0, 1.049 2496, 2055, 1, 1, 1.049 2496, 2049, 7, 0, 0.953 2496, 2049, 7, 1, 0.953 2560, 0, 0, 0, 0.991 2560, 0, 0, 1, 0.991 2560, 8, 0, 0, 1.031 2560, 8, 0, 1, 1.032 2560, 40, 0, 0, 1.029 2560, 40, 0, 1, 1.029 2560, 0, 8, 0, 0.992 2560, 0, 8, 1, 0.992 2560, 0, 40, 0, 0.975 2560, 0, 40, 1, 0.984 2560, 8, 8, 0, 0.942 2560, 8, 8, 1, 0.943 2560, 40, 40, 0, 1.139 2560, 40, 40, 1, 1.139 2560, 2048, 0, 0, 0.993 2560, 2048, 0, 1, 0.993 2560, 2056, 0, 0, 1.032 2560, 2056, 0, 1, 1.032 2560, 2048, 8, 0, 0.812 2560, 2048, 8, 1, 0.812 2560, 2056, 8, 0, 0.912 2560, 2056, 8, 1, 0.912 2560, 8, 1, 0, 1.068 2560, 8, 1, 1, 1.069 2560, 1, 8, 0, 0.974 2560, 1, 8, 1, 0.974 2560, 40, 1, 0, 1.068 2560, 40, 1, 1, 1.068 2560, 1, 40, 0, 0.996 2560, 1, 40, 1, 0.996 2560, 2056, 1, 0, 1.063 2560, 2056, 1, 1, 1.063 2560, 2049, 8, 0, 0.969 2560, 2049, 8, 1, 0.969 2624, 0, 0, 0, 0.995 2624, 0, 0, 1, 0.994 2624, 9, 0, 0, 1.015 2624, 9, 0, 1, 
1.018 2624, 41, 0, 0, 1.044 2624, 41, 0, 1, 1.044 2624, 0, 9, 0, 0.988 2624, 0, 9, 1, 0.99 2624, 0, 41, 0, 0.989 2624, 0, 41, 1, 0.99 2624, 9, 9, 0, 0.943 2624, 9, 9, 1, 0.943 2624, 41, 41, 0, 0.993 2624, 41, 41, 1, 0.993 2624, 2048, 0, 0, 0.998 2624, 2048, 0, 1, 0.998 2624, 2057, 0, 0, 1.018 2624, 2057, 0, 1, 1.018 2624, 2048, 9, 0, 0.81 2624, 2048, 9, 1, 0.81 2624, 2057, 9, 0, 0.907 2624, 2057, 9, 1, 0.907 2624, 9, 1, 0, 1.09 2624, 9, 1, 1, 1.09 2624, 1, 9, 0, 0.967 2624, 1, 9, 1, 0.967 2624, 41, 1, 0, 1.084 2624, 41, 1, 1, 1.085 2624, 1, 41, 0, 0.958 2624, 1, 41, 1, 0.957 2624, 2057, 1, 0, 1.087 2624, 2057, 1, 1, 1.087 2624, 2049, 9, 0, 0.965 2624, 2049, 9, 1, 0.965 2688, 0, 0, 0, 0.995 2688, 0, 0, 1, 0.995 2688, 10, 0, 0, 1.01 2688, 10, 0, 1, 1.012 2688, 42, 0, 0, 1.036 2688, 42, 0, 1, 1.036 2688, 0, 10, 0, 0.978 2688, 0, 10, 1, 0.979 2688, 0, 42, 0, 0.977 2688, 0, 42, 1, 0.978 2688, 10, 10, 0, 0.942 2688, 10, 10, 1, 0.942 2688, 42, 42, 0, 0.989 2688, 42, 42, 1, 0.989 2688, 2048, 0, 0, 0.995 2688, 2048, 0, 1, 0.995 2688, 2058, 0, 0, 1.012 2688, 2058, 0, 1, 1.012 2688, 2048, 10, 0, 0.804 2688, 2048, 10, 1, 0.804 2688, 2058, 10, 0, 0.905 2688, 2058, 10, 1, 0.905 2688, 10, 1, 0, 0.986 2688, 10, 1, 1, 0.987 2688, 1, 10, 0, 0.893 2688, 1, 10, 1, 0.894 2688, 42, 1, 0, 1.054 2688, 42, 1, 1, 1.054 2688, 1, 42, 0, 0.958 2688, 1, 42, 1, 0.958 2688, 2058, 1, 0, 1.052 2688, 2058, 1, 1, 1.052 2688, 2049, 10, 0, 0.954 2688, 2049, 10, 1, 0.954 2752, 0, 0, 0, 1.0 2752, 0, 0, 1, 0.992 2752, 11, 0, 0, 0.954 2752, 11, 0, 1, 0.954 2752, 43, 0, 0, 0.979 2752, 43, 0, 1, 0.979 2752, 0, 11, 0, 0.939 2752, 0, 11, 1, 0.939 2752, 0, 43, 0, 0.931 2752, 0, 43, 1, 0.932 2752, 11, 11, 0, 0.949 2752, 11, 11, 1, 0.949 2752, 43, 43, 0, 1.007 2752, 43, 43, 1, 1.007 2752, 2048, 0, 0, 0.993 2752, 2048, 0, 1, 0.993 2752, 2059, 0, 0, 0.954 2752, 2059, 0, 1, 0.954 2752, 2048, 11, 0, 0.77 2752, 2048, 11, 1, 0.77 2752, 2059, 11, 0, 0.916 2752, 2059, 11, 1, 0.916 2752, 11, 1, 0, 0.994 2752, 11, 1, 1, 0.994 2752, 1, 11, 0, 0.928 2752, 1, 11, 1, 0.928 2752, 43, 1, 0, 1.022 2752, 43, 1, 1, 1.022 2752, 1, 43, 0, 0.92 2752, 1, 43, 1, 0.92 2752, 2059, 1, 0, 0.989 2752, 2059, 1, 1, 0.989 2752, 2049, 11, 0, 0.923 2752, 2049, 11, 1, 0.923 2816, 0, 0, 0, 1.003 2816, 0, 0, 1, 1.003 2816, 12, 0, 0, 0.897 2816, 12, 0, 1, 0.894 2816, 44, 0, 0, 0.914 2816, 44, 0, 1, 0.914 2816, 0, 12, 0, 0.876 2816, 0, 12, 1, 0.874 2816, 0, 44, 0, 0.871 2816, 0, 44, 1, 0.87 2816, 12, 12, 0, 0.948 2816, 12, 12, 1, 0.948 2816, 44, 44, 0, 1.009 2816, 44, 44, 1, 1.009 2816, 2048, 0, 0, 1.005 2816, 2048, 0, 1, 1.005 2816, 2060, 0, 0, 0.894 2816, 2060, 0, 1, 0.894 2816, 2048, 12, 0, 0.714 2816, 2048, 12, 1, 0.713 2816, 2060, 12, 0, 0.915 2816, 2060, 12, 1, 0.915 2816, 12, 1, 0, 0.917 2816, 12, 1, 1, 0.917 2816, 1, 12, 0, 0.858 2816, 1, 12, 1, 0.857 2816, 44, 1, 0, 0.944 2816, 44, 1, 1, 0.943 2816, 1, 44, 0, 0.856 2816, 1, 44, 1, 0.856 2816, 2060, 1, 0, 0.914 2816, 2060, 1, 1, 0.914 2816, 2049, 12, 0, 0.855 2816, 2049, 12, 1, 0.855 2880, 0, 0, 0, 0.989 2880, 0, 0, 1, 0.989 2880, 13, 0, 0, 0.967 2880, 13, 0, 1, 0.967 2880, 45, 0, 0, 0.987 2880, 45, 0, 1, 0.987 2880, 0, 13, 0, 0.925 2880, 0, 13, 1, 0.925 2880, 0, 45, 0, 0.927 2880, 0, 45, 1, 0.927 2880, 13, 13, 0, 0.944 2880, 13, 13, 1, 0.944 2880, 45, 45, 0, 1.003 2880, 45, 45, 1, 1.003 2880, 2048, 0, 0, 0.989 2880, 2048, 0, 1, 0.989 2880, 2061, 0, 0, 0.967 2880, 2061, 0, 1, 0.967 2880, 2048, 13, 0, 0.76 2880, 2048, 13, 1, 0.76 2880, 2061, 13, 0, 0.91 2880, 2061, 13, 1, 0.91 2880, 13, 1, 0, 0.922 2880, 13, 1, 1, 
0.922 2880, 1, 13, 0, 0.859 2880, 1, 13, 1, 0.859 2880, 45, 1, 0, 1.013 2880, 45, 1, 1, 1.013 2880, 1, 45, 0, 0.92 2880, 1, 45, 1, 0.92 2880, 2061, 1, 0, 0.984 2880, 2061, 1, 1, 0.984 2880, 2049, 13, 0, 0.918 2880, 2049, 13, 1, 0.918 2944, 0, 0, 0, 1.014 2944, 0, 0, 1, 1.014 2944, 14, 0, 0, 0.956 2944, 14, 0, 1, 0.955 2944, 46, 0, 0, 0.979 2944, 46, 0, 1, 0.979 2944, 0, 14, 0, 0.937 2944, 0, 14, 1, 0.937 2944, 0, 46, 0, 0.93 2944, 0, 46, 1, 0.93 2944, 14, 14, 0, 0.953 2944, 14, 14, 1, 0.953 2944, 46, 46, 0, 1.009 2944, 46, 46, 1, 1.009 2944, 2048, 0, 0, 1.015 2944, 2048, 0, 1, 1.015 2944, 2062, 0, 0, 0.955 2944, 2062, 0, 1, 0.955 2944, 2048, 14, 0, 0.769 2944, 2048, 14, 1, 0.769 2944, 2062, 14, 0, 0.923 2944, 2062, 14, 1, 0.923 2944, 14, 1, 0, 0.994 2944, 14, 1, 1, 0.994 2944, 1, 14, 0, 0.927 2944, 1, 14, 1, 0.927 2944, 46, 1, 0, 1.021 2944, 46, 1, 1, 1.021 2944, 1, 46, 0, 0.923 2944, 1, 46, 1, 0.923 2944, 2062, 1, 0, 0.988 2944, 2062, 1, 1, 0.988 2944, 2049, 14, 0, 0.922 2944, 2049, 14, 1, 0.922 3008, 0, 0, 0, 0.994 3008, 0, 0, 1, 0.994 3008, 15, 0, 0, 0.941 3008, 15, 0, 1, 0.941 3008, 47, 0, 0, 0.996 3008, 47, 0, 1, 0.996 3008, 0, 15, 0, 0.929 3008, 0, 15, 1, 0.933 3008, 0, 47, 0, 0.933 3008, 0, 47, 1, 0.933 3008, 15, 15, 0, 0.952 3008, 15, 15, 1, 0.949 3008, 47, 47, 0, 1.003 3008, 47, 47, 1, 1.003 3008, 2048, 0, 0, 0.998 3008, 2048, 0, 1, 0.998 3008, 2063, 0, 0, 0.941 3008, 2063, 0, 1, 0.941 3008, 2048, 15, 0, 0.766 3008, 2048, 15, 1, 0.766 3008, 2063, 15, 0, 0.916 3008, 2063, 15, 1, 0.916 3008, 15, 1, 0, 0.985 3008, 15, 1, 1, 0.985 3008, 1, 15, 0, 0.916 3008, 1, 15, 1, 0.916 3008, 47, 1, 0, 1.014 3008, 47, 1, 1, 1.014 3008, 1, 47, 0, 0.902 3008, 1, 47, 1, 0.902 3008, 2063, 1, 0, 0.981 3008, 2063, 1, 1, 0.981 3008, 2049, 15, 0, 0.912 3008, 2049, 15, 1, 0.913 3072, 0, 0, 0, 1.016 3072, 0, 0, 1, 1.015 3072, 16, 0, 0, 1.045 3072, 16, 0, 1, 1.045 3072, 48, 0, 0, 1.045 3072, 48, 0, 1, 1.045 3072, 0, 16, 0, 1.049 3072, 0, 16, 1, 1.049 3072, 0, 48, 0, 1.049 3072, 0, 48, 1, 1.049 3072, 16, 16, 0, 1.016 3072, 16, 16, 1, 1.016 3072, 48, 48, 0, 1.016 3072, 48, 48, 1, 1.016 3072, 2048, 0, 0, 1.016 3072, 2048, 0, 1, 1.016 3072, 2064, 0, 0, 1.045 3072, 2064, 0, 1, 1.045 3072, 2048, 16, 0, 1.049 3072, 2048, 16, 1, 1.049 3072, 2064, 16, 0, 1.016 3072, 2064, 16, 1, 1.016 3072, 16, 1, 0, 0.815 3072, 16, 1, 1, 0.815 3072, 1, 16, 0, 0.872 3072, 1, 16, 1, 0.872 3072, 48, 1, 0, 1.017 3072, 48, 1, 1, 1.017 3072, 1, 48, 0, 0.872 3072, 1, 48, 1, 0.872 3072, 2064, 1, 0, 0.815 3072, 2064, 1, 1, 0.815 3072, 2049, 16, 0, 0.872 3072, 2049, 16, 1, 0.872 3136, 0, 0, 0, 0.995 3136, 0, 0, 1, 0.995 3136, 17, 0, 0, 0.949 3136, 17, 0, 1, 0.949 3136, 49, 0, 0, 0.987 3136, 49, 0, 1, 0.987 3136, 0, 17, 0, 0.919 3136, 0, 17, 1, 0.917 3136, 0, 49, 0, 0.931 3136, 0, 49, 1, 0.931 3136, 17, 17, 0, 1.122 3136, 17, 17, 1, 1.119 3136, 49, 49, 0, 0.987 3136, 49, 49, 1, 0.987 3136, 2048, 0, 0, 0.997 3136, 2048, 0, 1, 0.997 3136, 2065, 0, 0, 0.949 3136, 2065, 0, 1, 0.949 3136, 2048, 17, 0, 0.896 3136, 2048, 17, 1, 0.896 3136, 2065, 17, 0, 1.122 3136, 2065, 17, 1, 1.119 3136, 17, 1, 0, 1.184 3136, 17, 1, 1, 1.184 3136, 1, 17, 0, 1.124 3136, 1, 17, 1, 1.125 3136, 49, 1, 0, 1.11 3136, 49, 1, 1, 1.108 3136, 1, 49, 0, 1.044 3136, 1, 49, 1, 1.044 3136, 2065, 1, 0, 1.147 3136, 2065, 1, 1, 1.147 3136, 2049, 17, 0, 1.102 3136, 2049, 17, 1, 1.1 3200, 0, 0, 0, 1.006 3200, 0, 0, 1, 1.006 3200, 18, 0, 0, 0.978 3200, 18, 0, 1, 0.978 3200, 50, 0, 0, 0.998 3200, 50, 0, 1, 0.998 3200, 0, 18, 0, 0.932 3200, 0, 18, 1, 0.932 3200, 0, 50, 0, 0.93 3200, 0, 
50, 1, 0.93 3200, 18, 18, 0, 1.11 3200, 18, 18, 1, 1.11 3200, 50, 50, 0, 0.994 3200, 50, 50, 1, 0.994 3200, 2048, 0, 0, 1.007 3200, 2048, 0, 1, 1.007 3200, 2066, 0, 0, 0.978 3200, 2066, 0, 1, 0.978 3200, 2048, 18, 0, 0.894 3200, 2048, 18, 1, 0.894 3200, 2066, 18, 0, 1.11 3200, 2066, 18, 1, 1.11 3200, 18, 1, 0, 1.002 3200, 18, 1, 1, 1.002 3200, 1, 18, 0, 0.917 3200, 1, 18, 1, 0.917 3200, 50, 1, 0, 0.963 3200, 50, 1, 1, 0.964 3200, 1, 50, 0, 0.888 3200, 1, 50, 1, 0.888 3200, 2066, 1, 0, 1.002 3200, 2066, 1, 1, 1.002 3200, 2049, 18, 0, 0.914 3200, 2049, 18, 1, 0.914 3264, 0, 0, 0, 0.994 3264, 0, 0, 1, 0.994 3264, 19, 0, 0, 0.959 3264, 19, 0, 1, 0.959 3264, 51, 0, 0, 0.994 3264, 51, 0, 1, 0.994 3264, 0, 19, 0, 0.927 3264, 0, 19, 1, 0.927 3264, 0, 51, 0, 0.927 3264, 0, 51, 1, 0.927 3264, 19, 19, 0, 1.1 3264, 19, 19, 1, 1.1 3264, 51, 51, 0, 0.982 3264, 51, 51, 1, 0.982 3264, 2048, 0, 0, 0.994 3264, 2048, 0, 1, 0.994 3264, 2067, 0, 0, 0.959 3264, 2067, 0, 1, 0.959 3264, 2048, 19, 0, 0.891 3264, 2048, 19, 1, 0.891 3264, 2067, 19, 0, 1.099 3264, 2067, 19, 1, 1.099 3264, 19, 1, 0, 0.977 3264, 19, 1, 1, 0.976 3264, 1, 19, 0, 0.921 3264, 1, 19, 1, 0.921 3264, 51, 1, 0, 0.959 3264, 51, 1, 1, 0.959 3264, 1, 51, 0, 0.886 3264, 1, 51, 1, 0.886 3264, 2067, 1, 0, 0.976 3264, 2067, 1, 1, 0.976 3264, 2049, 19, 0, 0.917 3264, 2049, 19, 1, 0.917 3328, 0, 0, 0, 0.996 3328, 0, 0, 1, 0.992 3328, 20, 0, 0, 0.955 3328, 20, 0, 1, 0.955 3328, 52, 0, 0, 0.99 3328, 52, 0, 1, 0.99 3328, 0, 20, 0, 0.926 3328, 0, 20, 1, 0.923 3328, 0, 52, 0, 0.933 3328, 0, 52, 1, 0.933 3328, 20, 20, 0, 1.11 3328, 20, 20, 1, 1.11 3328, 52, 52, 0, 0.988 3328, 52, 52, 1, 0.988 3328, 2048, 0, 0, 0.993 3328, 2048, 0, 1, 0.993 3328, 2068, 0, 0, 0.955 3328, 2068, 0, 1, 0.955 3328, 2048, 20, 0, 0.9 3328, 2048, 20, 1, 0.9 3328, 2068, 20, 0, 1.109 3328, 2068, 20, 1, 1.109 3328, 20, 1, 0, 0.99 3328, 20, 1, 1, 0.99 3328, 1, 20, 0, 0.922 3328, 1, 20, 1, 0.922 3328, 52, 1, 0, 0.972 3328, 52, 1, 1, 0.972 3328, 1, 52, 0, 0.901 3328, 1, 52, 1, 0.901 3328, 2068, 1, 0, 0.99 3328, 2068, 1, 1, 0.99 3328, 2049, 20, 0, 0.918 3328, 2049, 20, 1, 0.918 3392, 0, 0, 0, 0.998 3392, 0, 0, 1, 1.0 3392, 21, 0, 0, 0.964 3392, 21, 0, 1, 0.964 3392, 53, 0, 0, 0.998 3392, 53, 0, 1, 0.998 3392, 0, 21, 0, 0.932 3392, 0, 21, 1, 0.932 3392, 0, 53, 0, 0.93 3392, 0, 53, 1, 0.93 3392, 21, 21, 0, 1.113 3392, 21, 21, 1, 1.113 3392, 53, 53, 0, 0.983 3392, 53, 53, 1, 0.983 3392, 2048, 0, 0, 1.0 3392, 2048, 0, 1, 1.0 3392, 2069, 0, 0, 0.964 3392, 2069, 0, 1, 0.964 3392, 2048, 21, 0, 0.895 3392, 2048, 21, 1, 0.896 3392, 2069, 21, 0, 1.113 3392, 2069, 21, 1, 1.113 3392, 21, 1, 0, 0.994 3392, 21, 1, 1, 0.994 3392, 1, 21, 0, 0.923 3392, 1, 21, 1, 0.923 3392, 53, 1, 0, 0.972 3392, 53, 1, 1, 0.972 3392, 1, 53, 0, 0.891 3392, 1, 53, 1, 0.891 3392, 2069, 1, 0, 0.994 3392, 2069, 1, 1, 0.994 3392, 2049, 21, 0, 0.922 3392, 2049, 21, 1, 0.922 3456, 0, 0, 0, 0.995 3456, 0, 0, 1, 0.995 3456, 22, 0, 0, 0.965 3456, 22, 0, 1, 0.965 3456, 54, 0, 0, 0.996 3456, 54, 0, 1, 0.996 3456, 0, 22, 0, 0.927 3456, 0, 22, 1, 0.927 3456, 0, 54, 0, 0.927 3456, 0, 54, 1, 0.927 3456, 22, 22, 0, 1.107 3456, 22, 22, 1, 1.107 3456, 54, 54, 0, 0.98 3456, 54, 54, 1, 0.98 3456, 2048, 0, 0, 0.995 3456, 2048, 0, 1, 0.995 3456, 2070, 0, 0, 0.965 3456, 2070, 0, 1, 0.965 3456, 2048, 22, 0, 0.893 3456, 2048, 22, 1, 0.893 3456, 2070, 22, 0, 1.107 3456, 2070, 22, 1, 1.107 3456, 22, 1, 0, 0.988 3456, 22, 1, 1, 0.988 3456, 1, 22, 0, 0.921 3456, 1, 22, 1, 0.921 3456, 54, 1, 0, 0.963 3456, 54, 1, 1, 0.963 3456, 1, 54, 0, 0.887 3456, 1, 
54, 1, 0.887 3456, 2070, 1, 0, 0.988 3456, 2070, 1, 1, 0.988 3456, 2049, 22, 0, 0.917 3456, 2049, 22, 1, 0.917 3520, 0, 0, 0, 1.016 3520, 0, 0, 1, 1.016 3520, 23, 0, 0, 0.957 3520, 23, 0, 1, 0.957 3520, 55, 0, 0, 0.991 3520, 55, 0, 1, 0.991 3520, 0, 23, 0, 0.919 3520, 0, 23, 1, 0.924 3520, 0, 55, 0, 0.934 3520, 0, 55, 1, 0.934 3520, 23, 23, 0, 1.111 3520, 23, 23, 1, 1.111 3520, 55, 55, 0, 0.994 3520, 55, 55, 1, 0.994 3520, 2048, 0, 0, 1.016 3520, 2048, 0, 1, 1.016 3520, 2071, 0, 0, 0.957 3520, 2071, 0, 1, 0.957 3520, 2048, 23, 0, 0.903 3520, 2048, 23, 1, 0.903 3520, 2071, 23, 0, 1.111 3520, 2071, 23, 1, 1.111 3520, 23, 1, 0, 0.997 3520, 23, 1, 1, 0.997 3520, 1, 23, 0, 0.921 3520, 1, 23, 1, 0.921 3520, 55, 1, 0, 0.976 3520, 55, 1, 1, 0.976 3520, 1, 55, 0, 0.902 3520, 1, 55, 1, 0.902 3520, 2071, 1, 0, 0.997 3520, 2071, 1, 1, 0.997 3520, 2049, 23, 0, 0.918 3520, 2049, 23, 1, 0.918 3584, 0, 0, 0, 1.004 3584, 0, 0, 1, 1.004 3584, 24, 0, 0, 0.985 3584, 24, 0, 1, 0.979 3584, 56, 0, 0, 1.006 3584, 56, 0, 1, 1.006 3584, 0, 24, 0, 0.931 3584, 0, 24, 1, 0.931 3584, 0, 56, 0, 0.93 3584, 0, 56, 1, 0.93 3584, 24, 24, 0, 1.111 3584, 24, 24, 1, 1.11 3584, 56, 56, 0, 1.101 3584, 56, 56, 1, 1.1 3584, 2048, 0, 0, 1.005 3584, 2048, 0, 1, 1.005 3584, 2072, 0, 0, 0.98 3584, 2072, 0, 1, 0.978 3584, 2048, 24, 0, 0.896 3584, 2048, 24, 1, 0.897 3584, 2072, 24, 0, 1.111 3584, 2072, 24, 1, 1.111 3584, 24, 1, 0, 1.004 3584, 24, 1, 1, 1.004 3584, 1, 24, 0, 0.921 3584, 1, 24, 1, 0.921 3584, 56, 1, 0, 0.971 3584, 56, 1, 1, 0.97 3584, 1, 56, 0, 0.89 3584, 1, 56, 1, 0.89 3584, 2072, 1, 0, 1.004 3584, 2072, 1, 1, 1.004 3584, 2049, 24, 0, 0.918 3584, 2049, 24, 1, 0.918 3648, 0, 0, 0, 1.012 3648, 0, 0, 1, 1.012 3648, 25, 0, 0, 0.96 3648, 25, 0, 1, 0.96 3648, 57, 0, 0, 0.988 3648, 57, 0, 1, 0.988 3648, 0, 25, 0, 0.927 3648, 0, 25, 1, 0.927 3648, 0, 57, 0, 0.927 3648, 0, 57, 1, 0.927 3648, 25, 25, 0, 1.101 3648, 25, 25, 1, 1.101 3648, 57, 57, 0, 0.986 3648, 57, 57, 1, 0.986 3648, 2048, 0, 0, 1.012 3648, 2048, 0, 1, 1.012 3648, 2073, 0, 0, 0.96 3648, 2073, 0, 1, 0.959 3648, 2048, 25, 0, 0.894 3648, 2048, 25, 1, 0.895 3648, 2073, 25, 0, 1.103 3648, 2073, 25, 1, 1.103 3648, 25, 1, 0, 1.024 3648, 25, 1, 1, 1.024 3648, 1, 25, 0, 0.911 3648, 1, 25, 1, 0.912 3648, 57, 1, 0, 0.973 3648, 57, 1, 1, 0.974 3648, 1, 57, 0, 0.888 3648, 1, 57, 1, 0.888 3648, 2073, 1, 0, 1.024 3648, 2073, 1, 1, 1.024 3648, 2049, 25, 0, 0.907 3648, 2049, 25, 1, 0.907 3712, 0, 0, 0, 0.996 3712, 0, 0, 1, 0.996 3712, 26, 0, 0, 0.96 3712, 26, 0, 1, 0.96 3712, 58, 0, 0, 0.995 3712, 58, 0, 1, 0.995 3712, 0, 26, 0, 0.919 3712, 0, 26, 1, 0.918 3712, 0, 58, 0, 0.93 3712, 0, 58, 1, 0.93 3712, 26, 26, 0, 1.103 3712, 26, 26, 1, 1.102 3712, 58, 58, 0, 0.989 3712, 58, 58, 1, 0.989 3712, 2048, 0, 0, 0.997 3712, 2048, 0, 1, 0.997 3712, 2074, 0, 0, 0.959 3712, 2074, 0, 1, 0.959 3712, 2048, 26, 0, 0.901 3712, 2048, 26, 1, 0.901 3712, 2074, 26, 0, 1.104 3712, 2074, 26, 1, 1.102 3712, 26, 1, 0, 1.001 3712, 26, 1, 1, 1.001 3712, 1, 26, 0, 0.922 3712, 1, 26, 1, 0.922 3712, 58, 1, 0, 0.974 3712, 58, 1, 1, 0.974 3712, 1, 58, 0, 0.903 3712, 1, 58, 1, 0.903 3712, 2074, 1, 0, 1.001 3712, 2074, 1, 1, 1.001 3712, 2049, 26, 0, 0.919 3712, 2049, 26, 1, 0.919 3776, 0, 0, 0, 1.003 3776, 0, 0, 1, 1.003 3776, 27, 0, 0, 0.964 3776, 27, 0, 1, 0.964 3776, 59, 0, 0, 1.004 3776, 59, 0, 1, 1.004 3776, 0, 27, 0, 0.931 3776, 0, 27, 1, 0.931 3776, 0, 59, 0, 0.929 3776, 0, 59, 1, 0.93 3776, 27, 27, 0, 1.097 3776, 27, 27, 1, 1.097 3776, 59, 59, 0, 0.992 3776, 59, 59, 1, 0.992 3776, 2048, 0, 0, 1.003 3776, 
2048, 0, 1, 1.003 3776, 2075, 0, 0, 0.963 3776, 2075, 0, 1, 0.964 3776, 2048, 27, 0, 0.898 3776, 2048, 27, 1, 0.898 3776, 2075, 27, 0, 1.097 3776, 2075, 27, 1, 1.097 3776, 27, 1, 0, 0.998 3776, 27, 1, 1, 0.998 3776, 1, 27, 0, 0.925 3776, 1, 27, 1, 0.925 3776, 59, 1, 0, 0.979 3776, 59, 1, 1, 0.979 3776, 1, 59, 0, 0.894 3776, 1, 59, 1, 0.894 3776, 2075, 1, 0, 0.998 3776, 2075, 1, 1, 0.999 3776, 2049, 27, 0, 0.923 3776, 2049, 27, 1, 0.923 3840, 0, 0, 0, 0.997 3840, 0, 0, 1, 0.997 3840, 28, 0, 0, 0.968 3840, 28, 0, 1, 0.968 3840, 60, 0, 0, 1.001 3840, 60, 0, 1, 1.001 3840, 0, 28, 0, 0.926 3840, 0, 28, 1, 0.927 3840, 0, 60, 0, 0.927 3840, 0, 60, 1, 0.927 3840, 28, 28, 0, 1.094 3840, 28, 28, 1, 1.094 3840, 60, 60, 0, 0.982 3840, 60, 60, 1, 0.982 3840, 2048, 0, 0, 0.998 3840, 2048, 0, 1, 0.998 3840, 2076, 0, 0, 0.968 3840, 2076, 0, 1, 0.968 3840, 2048, 28, 0, 0.896 3840, 2048, 28, 1, 0.896 3840, 2076, 28, 0, 1.094 3840, 2076, 28, 1, 1.094 3840, 28, 1, 0, 0.983 3840, 28, 1, 1, 0.982 3840, 1, 28, 0, 0.916 3840, 1, 28, 1, 0.916 3840, 60, 1, 0, 0.969 3840, 60, 1, 1, 0.969 3840, 1, 60, 0, 0.891 3840, 1, 60, 1, 0.891 3840, 2076, 1, 0, 0.983 3840, 2076, 1, 1, 0.983 3840, 2049, 28, 0, 0.912 3840, 2049, 28, 1, 0.912 3904, 0, 0, 0, 1.002 3904, 0, 0, 1, 1.0 3904, 29, 0, 0, 0.961 3904, 29, 0, 1, 0.961 3904, 61, 0, 0, 0.997 3904, 61, 0, 1, 0.997 3904, 0, 29, 0, 0.915 3904, 0, 29, 1, 0.922 3904, 0, 61, 0, 0.933 3904, 0, 61, 1, 0.933 3904, 29, 29, 0, 1.103 3904, 29, 29, 1, 1.103 3904, 61, 61, 0, 0.995 3904, 61, 61, 1, 0.995 3904, 2048, 0, 0, 0.998 3904, 2048, 0, 1, 1.0 3904, 2077, 0, 0, 0.961 3904, 2077, 0, 1, 0.961 3904, 2048, 29, 0, 0.904 3904, 2048, 29, 1, 0.904 3904, 2077, 29, 0, 1.103 3904, 2077, 29, 1, 1.103 3904, 29, 1, 0, 1.0 3904, 29, 1, 1, 1.0 3904, 1, 29, 0, 0.922 3904, 1, 29, 1, 0.922 3904, 61, 1, 0, 0.98 3904, 61, 1, 1, 0.98 3904, 1, 61, 0, 0.904 3904, 1, 61, 1, 0.904 3904, 2077, 1, 0, 1.0 3904, 2077, 1, 1, 1.0 3904, 2049, 29, 0, 0.919 3904, 2049, 29, 1, 0.919 3968, 0, 0, 0, 1.003 3968, 0, 0, 1, 1.003 3968, 30, 0, 0, 0.969 3968, 30, 0, 1, 0.969 3968, 62, 0, 0, 1.006 3968, 62, 0, 1, 1.006 3968, 0, 30, 0, 0.931 3968, 0, 30, 1, 0.93 3968, 0, 62, 0, 0.929 3968, 0, 62, 1, 0.929 3968, 30, 30, 0, 1.103 3968, 30, 30, 1, 1.103 3968, 62, 62, 0, 0.99 3968, 62, 62, 1, 0.99 3968, 2048, 0, 0, 1.004 3968, 2048, 0, 1, 1.004 3968, 2078, 0, 0, 0.969 3968, 2078, 0, 1, 0.969 3968, 2048, 30, 0, 0.899 3968, 2048, 30, 1, 0.899 3968, 2078, 30, 0, 1.105 3968, 2078, 30, 1, 1.105 3968, 30, 1, 0, 0.993 3968, 30, 1, 1, 0.993 3968, 1, 30, 0, 0.908 3968, 1, 30, 1, 0.908 3968, 62, 1, 0, 0.978 3968, 62, 1, 1, 0.978 3968, 1, 62, 0, 0.895 3968, 1, 62, 1, 0.895 3968, 2078, 1, 0, 0.993 3968, 2078, 1, 1, 0.993 3968, 2049, 30, 0, 0.904 3968, 2049, 30, 1, 0.904 4032, 0, 0, 0, 0.995 4032, 0, 0, 1, 0.995 4032, 31, 0, 0, 0.967 4032, 31, 0, 1, 0.967 4032, 63, 0, 0, 1.002 4032, 63, 0, 1, 1.002 4032, 0, 31, 0, 0.927 4032, 0, 31, 1, 0.926 4032, 0, 63, 0, 0.927 4032, 0, 63, 1, 0.927 4032, 31, 31, 0, 1.09 4032, 31, 31, 1, 1.09 4032, 63, 63, 0, 0.987 4032, 63, 63, 1, 0.987 4032, 2048, 0, 0, 0.995 4032, 2048, 0, 1, 0.995 4032, 2079, 0, 0, 0.967 4032, 2079, 0, 1, 0.967 4032, 2048, 31, 0, 0.897 4032, 2048, 31, 1, 0.897 4032, 2079, 31, 0, 1.09 4032, 2079, 31, 1, 1.09 4032, 31, 1, 0, 0.989 4032, 31, 1, 1, 0.989 4032, 1, 31, 0, 0.911 4032, 1, 31, 1, 0.911 4032, 63, 1, 0, 0.971 4032, 63, 1, 1, 0.972 4032, 1, 63, 0, 0.892 4032, 1, 63, 1, 0.892 4032, 2079, 1, 0, 0.989 4032, 2079, 1, 1, 0.989 4032, 2049, 31, 0, 0.907 4032, 2049, 31, 1, 0.907 4096, 32, 0, 0, 
1.014 4096, 32, 0, 1, 1.014 4096, 64, 0, 0, 1.014 4096, 64, 0, 1, 1.014 4096, 0, 32, 0, 1.012 4096, 0, 32, 1, 1.012 4096, 0, 64, 0, 1.012 4096, 0, 64, 1, 1.012 4096, 32, 32, 0, 1.014 4096, 32, 32, 1, 1.014 4096, 64, 64, 0, 1.014 4096, 64, 64, 1, 1.014 4096, 2080, 0, 0, 1.014 4096, 2080, 0, 1, 1.014 4096, 2048, 32, 0, 1.014 4096, 2048, 32, 1, 1.014 4096, 2080, 32, 0, 1.014 4096, 2080, 32, 1, 1.014 4096, 32, 1, 0, 0.975 4096, 32, 1, 1, 0.975 4096, 1, 32, 0, 0.769 4096, 1, 32, 1, 0.769 4096, 64, 1, 0, 0.858 4096, 64, 1, 1, 0.858 4096, 1, 64, 0, 0.769 4096, 1, 64, 1, 0.769 4096, 2080, 1, 0, 0.829 4096, 2080, 1, 1, 0.829 4096, 2049, 32, 0, 0.886 4096, 2049, 32, 1, 0.886 4160, 0, 0, 0, 1.003 4160, 0, 0, 1, 1.003 4160, 33, 0, 0, 1.004 4160, 33, 0, 1, 1.004 4160, 65, 0, 0, 0.999 4160, 65, 0, 1, 0.999 4160, 0, 33, 0, 0.931 4160, 0, 33, 1, 0.931 4160, 0, 65, 0, 0.765 4160, 0, 65, 1, 0.765 4160, 33, 33, 0, 0.998 4160, 33, 33, 1, 0.998 4160, 65, 65, 0, 0.942 4160, 65, 65, 1, 0.942 4160, 2048, 0, 0, 1.003 4160, 2048, 0, 1, 1.003 4160, 2081, 0, 0, 1.004 4160, 2081, 0, 1, 1.004 4160, 2048, 33, 0, 0.899 4160, 2048, 33, 1, 0.898 4160, 2081, 33, 0, 1.002 4160, 2081, 33, 1, 1.002 4160, 33, 1, 0, 1.114 4160, 33, 1, 1, 1.114 4160, 1, 33, 0, 1.01 4160, 1, 33, 1, 1.01 4160, 65, 1, 0, 1.077 4160, 65, 1, 1, 1.077 4160, 1, 65, 0, 0.935 4160, 1, 65, 1, 0.935 4160, 2081, 1, 0, 1.077 4160, 2081, 1, 1, 1.077 4160, 2049, 33, 0, 1.007 4160, 2049, 33, 1, 1.007 4224, 0, 0, 0, 1.014 4224, 0, 0, 1, 1.014 4224, 34, 0, 0, 1.0 4224, 34, 0, 1, 1.0 4224, 66, 0, 0, 1.001 4224, 66, 0, 1, 1.001 4224, 0, 34, 0, 0.928 4224, 0, 34, 1, 0.928 4224, 0, 66, 0, 0.762 4224, 0, 66, 1, 0.762 4224, 34, 34, 0, 0.998 4224, 34, 34, 1, 0.998 4224, 66, 66, 0, 0.959 4224, 66, 66, 1, 0.959 4224, 2048, 0, 0, 1.014 4224, 2048, 0, 1, 1.014 4224, 2082, 0, 0, 1.001 4224, 2082, 0, 1, 1.001 4224, 2048, 34, 0, 0.899 4224, 2048, 34, 1, 0.898 4224, 2082, 34, 0, 0.998 4224, 2082, 34, 1, 0.998 4224, 34, 1, 0, 1.024 4224, 34, 1, 1, 1.023 4224, 1, 34, 0, 0.917 4224, 1, 34, 1, 0.917 4224, 66, 1, 0, 1.012 4224, 66, 1, 1, 1.013 4224, 1, 66, 0, 0.917 4224, 1, 66, 1, 0.917 4224, 2082, 1, 0, 1.022 4224, 2082, 1, 1, 1.022 4224, 2049, 34, 0, 0.914 4224, 2049, 34, 1, 0.914 4288, 0, 0, 0, 0.999 4288, 0, 0, 1, 0.999 4288, 35, 0, 0, 0.995 4288, 35, 0, 1, 0.996 4288, 67, 0, 0, 0.998 4288, 67, 0, 1, 0.998 4288, 0, 35, 0, 0.919 4288, 0, 35, 1, 0.918 4288, 0, 67, 0, 0.767 4288, 0, 67, 1, 0.767 4288, 35, 35, 0, 1.005 4288, 35, 35, 1, 1.004 4288, 67, 67, 0, 0.995 4288, 67, 67, 1, 0.995 4288, 2048, 0, 0, 0.999 4288, 2048, 0, 1, 0.999 4288, 2083, 0, 0, 0.995 4288, 2083, 0, 1, 0.995 4288, 2048, 35, 0, 0.905 4288, 2048, 35, 1, 0.904 4288, 2083, 35, 0, 1.005 4288, 2083, 35, 1, 1.004 4288, 35, 1, 0, 1.033 4288, 35, 1, 1, 1.032 4288, 1, 35, 0, 0.928 4288, 1, 35, 1, 0.928 4288, 67, 1, 0, 1.019 4288, 67, 1, 1, 1.02 4288, 1, 67, 0, 0.925 4288, 1, 67, 1, 0.924 4288, 2083, 1, 0, 1.03 4288, 2083, 1, 1, 1.03 4288, 2049, 35, 0, 0.925 4288, 2049, 35, 1, 0.926 4352, 0, 0, 0, 1.005 4352, 0, 0, 1, 1.005 4352, 36, 0, 0, 1.007 4352, 36, 0, 1, 1.006 4352, 68, 0, 0, 1.007 4352, 68, 0, 1, 1.008 4352, 0, 36, 0, 0.929 4352, 0, 36, 1, 0.929 4352, 0, 68, 0, 0.766 4352, 0, 68, 1, 0.766 4352, 36, 36, 0, 0.998 4352, 36, 36, 1, 0.998 4352, 68, 68, 0, 0.964 4352, 68, 68, 1, 0.964 4352, 2048, 0, 0, 1.006 4352, 2048, 0, 1, 1.006 4352, 2084, 0, 0, 1.006 4352, 2084, 0, 1, 1.006 4352, 2048, 36, 0, 0.897 4352, 2048, 36, 1, 0.898 4352, 2084, 36, 0, 0.998 4352, 2084, 36, 1, 0.998 4352, 36, 1, 0, 1.031 4352, 36, 1, 1, 1.031 
4352, 1, 36, 0, 0.924 4352, 1, 36, 1, 0.924 4352, 68, 1, 0, 0.999 4352, 68, 1, 1, 0.999 4352, 1, 68, 0, 0.922 4352, 1, 68, 1, 0.922 4352, 2084, 1, 0, 1.03 4352, 2084, 1, 1, 1.03 4352, 2049, 36, 0, 0.922 4352, 2049, 36, 1, 0.922 4416, 0, 0, 0, 0.997 4416, 0, 0, 1, 0.997 4416, 37, 0, 0, 1.002 4416, 37, 0, 1, 1.002 4416, 69, 0, 0, 1.004 4416, 69, 0, 1, 1.004 4416, 0, 37, 0, 0.928 4416, 0, 37, 1, 0.927 4416, 0, 69, 0, 0.762 4416, 0, 69, 1, 0.762 4416, 37, 37, 0, 0.994 4416, 37, 37, 1, 0.994 4416, 69, 69, 0, 0.959 4416, 69, 69, 1, 0.959 4416, 2048, 0, 0, 0.997 4416, 2048, 0, 1, 0.997 4416, 2085, 0, 0, 1.001 4416, 2085, 0, 1, 1.001 4416, 2048, 37, 0, 0.899 4416, 2048, 37, 1, 0.899 4416, 2085, 37, 0, 0.994 4416, 2085, 37, 1, 0.994 4416, 37, 1, 0, 1.024 4416, 37, 1, 1, 1.023 4416, 1, 37, 0, 0.923 4416, 1, 37, 1, 0.922 4416, 69, 1, 0, 1.009 4416, 69, 1, 1, 1.01 4416, 1, 69, 0, 0.917 4416, 1, 69, 1, 0.917 4416, 2085, 1, 0, 1.024 4416, 2085, 1, 1, 1.024 4416, 2049, 37, 0, 0.919 4416, 2049, 37, 1, 0.919 4480, 0, 0, 0, 1.0 4480, 0, 0, 1, 0.999 4480, 38, 0, 0, 0.996 4480, 38, 0, 1, 0.996 4480, 70, 0, 0, 1.0 4480, 70, 0, 1, 1.0 4480, 0, 38, 0, 0.919 4480, 0, 38, 1, 0.921 4480, 0, 70, 0, 0.767 4480, 0, 70, 1, 0.767 4480, 38, 38, 0, 1.002 4480, 38, 38, 1, 1.002 4480, 70, 70, 0, 0.963 4480, 70, 70, 1, 0.963 4480, 2048, 0, 0, 0.998 4480, 2048, 0, 1, 0.999 4480, 2086, 0, 0, 0.996 4480, 2086, 0, 1, 0.995 4480, 2048, 38, 0, 0.907 4480, 2048, 38, 1, 0.907 4480, 2086, 38, 0, 1.002 4480, 2086, 38, 1, 1.002 4480, 38, 1, 0, 1.032 4480, 38, 1, 1, 1.031 4480, 1, 38, 0, 0.919 4480, 1, 38, 1, 0.92 4480, 70, 1, 0, 1.018 4480, 70, 1, 1, 1.017 4480, 1, 70, 0, 0.916 4480, 1, 70, 1, 0.915 4480, 2086, 1, 0, 1.031 4480, 2086, 1, 1, 1.03 4480, 2049, 38, 0, 0.917 4480, 2049, 38, 1, 0.918 4544, 0, 0, 0, 1.002 4544, 0, 0, 1, 1.002 4544, 39, 0, 0, 1.007 4544, 39, 0, 1, 1.008 4544, 71, 0, 0, 1.002 4544, 71, 0, 1, 1.002 4544, 0, 39, 0, 0.93 4544, 0, 39, 1, 0.931 4544, 0, 71, 0, 0.766 4544, 0, 71, 1, 0.766 4544, 39, 39, 0, 1.001 4544, 39, 39, 1, 1.001 4544, 71, 71, 0, 0.966 4544, 71, 71, 1, 0.966 4544, 2048, 0, 0, 1.002 4544, 2048, 0, 1, 1.002 4544, 2087, 0, 0, 1.008 4544, 2087, 0, 1, 1.007 4544, 2048, 39, 0, 0.901 4544, 2048, 39, 1, 0.901 4544, 2087, 39, 0, 1.001 4544, 2087, 39, 1, 1.001 4544, 39, 1, 0, 1.025 4544, 39, 1, 1, 1.025 4544, 1, 39, 0, 0.919 4544, 1, 39, 1, 0.919 4544, 71, 1, 0, 0.991 4544, 71, 1, 1, 0.991 4544, 1, 71, 0, 0.921 4544, 1, 71, 1, 0.922 4544, 2087, 1, 0, 1.025 4544, 2087, 1, 1, 1.025 4544, 2049, 39, 0, 0.917 4544, 2049, 39, 1, 0.917 4608, 0, 0, 0, 0.997 4608, 0, 0, 1, 0.997 4608, 40, 0, 0, 1.013 4608, 40, 0, 1, 1.013 4608, 72, 0, 0, 1.013 4608, 72, 0, 1, 1.013 4608, 0, 40, 0, 0.925 4608, 0, 40, 1, 0.926 4608, 0, 72, 0, 0.765 4608, 0, 72, 1, 0.765 4608, 40, 40, 0, 1.084 4608, 40, 40, 1, 1.084 4608, 72, 72, 0, 0.966 4608, 72, 72, 1, 0.966 4608, 2048, 0, 0, 0.999 4608, 2048, 0, 1, 0.999 4608, 2088, 0, 0, 1.012 4608, 2088, 0, 1, 1.012 4608, 2048, 40, 0, 0.898 4608, 2048, 40, 1, 0.898 4608, 2088, 40, 0, 1.087 4608, 2088, 40, 1, 1.087 4608, 40, 1, 0, 1.006 4608, 40, 1, 1, 1.006 4608, 1, 40, 0, 0.926 4608, 1, 40, 1, 0.925 4608, 72, 1, 0, 1.012 4608, 72, 1, 1, 1.011 4608, 1, 72, 0, 0.92 4608, 1, 72, 1, 0.92 4608, 2088, 1, 0, 1.006 4608, 2088, 1, 1, 1.006 4608, 2049, 40, 0, 0.923 4608, 2049, 40, 1, 0.923 4672, 0, 0, 0, 1.014 4672, 0, 0, 1, 1.014 4672, 41, 0, 0, 1.003 4672, 41, 0, 1, 1.003 4672, 73, 0, 0, 0.983 4672, 73, 0, 1, 0.982 4672, 0, 41, 0, 0.916 4672, 0, 41, 1, 0.918 4672, 0, 73, 0, 0.772 4672, 0, 73, 1, 0.772 
4672, 41, 41, 0, 1.012 4672, 41, 41, 1, 1.012 4672, 73, 73, 0, 0.973 4672, 73, 73, 1, 0.973 4672, 2048, 0, 0, 1.014 4672, 2048, 0, 1, 1.014 4672, 2089, 0, 0, 1.002 4672, 2089, 0, 1, 1.002 4672, 2048, 41, 0, 0.907 4672, 2048, 41, 1, 0.908 4672, 2089, 41, 0, 1.012 4672, 2089, 41, 1, 1.012 4672, 41, 1, 0, 1.027 4672, 41, 1, 1, 1.027 4672, 1, 41, 0, 0.928 4672, 1, 41, 1, 0.927 4672, 73, 1, 0, 1.032 4672, 73, 1, 1, 1.03 4672, 1, 73, 0, 0.927 4672, 1, 73, 1, 0.927 4672, 2089, 1, 0, 1.026 4672, 2089, 1, 1, 1.027 4672, 2049, 41, 0, 0.925 4672, 2049, 41, 1, 0.925 4736, 0, 0, 0, 1.005 4736, 0, 0, 1, 1.005 4736, 42, 0, 0, 1.012 4736, 42, 0, 1, 1.012 4736, 74, 0, 0, 0.976 4736, 74, 0, 1, 0.975 4736, 0, 42, 0, 0.93 4736, 0, 42, 1, 0.93 4736, 0, 74, 0, 0.77 4736, 0, 74, 1, 0.77 4736, 42, 42, 0, 1.007 4736, 42, 42, 1, 1.007 4736, 74, 74, 0, 0.965 4736, 74, 74, 1, 0.965 4736, 2048, 0, 0, 1.006 4736, 2048, 0, 1, 1.006 4736, 2090, 0, 0, 1.013 4736, 2090, 0, 1, 1.013 4736, 2048, 42, 0, 0.902 4736, 2048, 42, 1, 0.902 4736, 2090, 42, 0, 1.007 4736, 2090, 42, 1, 1.007 4736, 42, 1, 0, 1.032 4736, 42, 1, 1, 1.032 4736, 1, 42, 0, 0.925 4736, 1, 42, 1, 0.925 4736, 74, 1, 0, 1.018 4736, 74, 1, 1, 1.018 4736, 1, 74, 0, 0.912 4736, 1, 74, 1, 0.912 4736, 2090, 1, 0, 1.032 4736, 2090, 1, 1, 1.032 4736, 2049, 42, 0, 0.923 4736, 2049, 42, 1, 0.923 4800, 0, 0, 0, 1.012 4800, 0, 0, 1, 1.012 4800, 43, 0, 0, 1.008 4800, 43, 0, 1, 1.008 4800, 75, 0, 0, 0.99 4800, 75, 0, 1, 0.99 4800, 0, 43, 0, 0.928 4800, 0, 43, 1, 0.928 4800, 0, 75, 0, 0.767 4800, 0, 75, 1, 0.768 4800, 43, 43, 0, 1.004 4800, 43, 43, 1, 1.004 4800, 75, 75, 0, 0.965 4800, 75, 75, 1, 0.965 4800, 2048, 0, 0, 1.012 4800, 2048, 0, 1, 1.012 4800, 2091, 0, 0, 1.009 4800, 2091, 0, 1, 1.008 4800, 2048, 43, 0, 0.902 4800, 2048, 43, 1, 0.902 4800, 2091, 43, 0, 1.004 4800, 2091, 43, 1, 1.004 4800, 43, 1, 0, 1.026 4800, 43, 1, 1, 1.025 4800, 1, 43, 0, 0.91 4800, 1, 43, 1, 0.91 4800, 75, 1, 0, 0.992 4800, 75, 1, 1, 0.992 4800, 1, 75, 0, 0.921 4800, 1, 75, 1, 0.92 4800, 2091, 1, 0, 1.025 4800, 2091, 1, 1, 1.025 4800, 2049, 43, 0, 0.907 4800, 2049, 43, 1, 0.907 4864, 0, 0, 0, 0.998 4864, 0, 0, 1, 0.998 4864, 44, 0, 0, 1.003 4864, 44, 0, 1, 1.004 4864, 76, 0, 0, 0.987 4864, 76, 0, 1, 0.987 4864, 0, 44, 0, 0.92 4864, 0, 44, 1, 0.921 4864, 0, 76, 0, 0.933 4864, 0, 76, 1, 0.932 4864, 44, 44, 0, 1.006 4864, 44, 44, 1, 1.004 4864, 76, 76, 0, 0.976 4864, 76, 76, 1, 0.975 4864, 2048, 0, 0, 0.999 4864, 2048, 0, 1, 0.999 4864, 2092, 0, 0, 1.004 4864, 2092, 0, 1, 1.005 4864, 2048, 44, 0, 0.907 4864, 2048, 44, 1, 0.907 4864, 2092, 44, 0, 1.006 4864, 2092, 44, 1, 1.005 4864, 44, 1, 0, 1.034 4864, 44, 1, 1, 1.032 4864, 1, 44, 0, 0.908 4864, 1, 44, 1, 0.929 4864, 76, 1, 0, 1.006 4864, 76, 1, 1, 1.005 4864, 1, 76, 0, 0.798 4864, 1, 76, 1, 0.798 4864, 2092, 1, 0, 1.033 4864, 2092, 1, 1, 1.033 4864, 2049, 44, 0, 0.904 4864, 2049, 44, 1, 0.925 4928, 0, 0, 0, 1.005 4928, 0, 0, 1, 1.005 4928, 45, 0, 0, 0.993 4928, 45, 0, 1, 1.012 4928, 77, 0, 0, 0.956 4928, 77, 0, 1, 0.976 4928, 0, 45, 0, 0.933 4928, 0, 45, 1, 0.932 4928, 0, 77, 0, 0.771 4928, 0, 77, 1, 0.771 4928, 45, 45, 0, 1.015 4928, 45, 45, 1, 1.015 4928, 77, 77, 0, 0.972 4928, 77, 77, 1, 0.972 4928, 2048, 0, 0, 1.005 4928, 2048, 0, 1, 1.005 4928, 2093, 0, 0, 0.992 4928, 2093, 0, 1, 1.012 4928, 2048, 45, 0, 0.932 4928, 2048, 45, 1, 0.931 4928, 2093, 45, 0, 1.015 4928, 2093, 45, 1, 1.015 4928, 45, 1, 0, 1.009 4928, 45, 1, 1, 1.032 4928, 1, 45, 0, 0.806 4928, 1, 45, 1, 0.805 4928, 77, 1, 0, 0.981 4928, 77, 1, 1, 1.005 4928, 1, 77, 0, 0.917 
4928, 1, 77, 1, 0.917
4928, 2093, 1, 0, 1.008
4928, 2093, 1, 1, 1.032
4928, 2049, 45, 0, 0.794
4928, 2049, 45, 1, 0.794
4992, 0, 0, 0, 0.999
4992, 0, 0, 1, 0.999
4992, 46, 0, 0, 0.985
4992, 46, 0, 1, 1.008
4992, 78, 0, 0, 0.963
4992, 78, 0, 1, 0.984
4992, 0, 46, 0, 0.908
4992, 0, 46, 1, 0.908
4992, 0, 78, 0, 0.752
4992, 0, 78, 1, 0.751
4992, 46, 46, 0, 0.997
4992, 46, 46, 1, 0.997
4992, 78, 78, 0, 0.969
4992, 78, 78, 1, 0.968
4992, 2048, 0, 0, 1.0
4992, 2048, 0, 1, 1.0
4992, 2094, 0, 0, 0.987
4992, 2094, 0, 1, 1.008
4992, 2048, 46, 0, 0.883
4992, 2048, 46, 1, 0.883
4992, 2094, 46, 0, 0.997
4992, 2094, 46, 1, 0.997
4992, 46, 1, 0, 0.998
4992, 46, 1, 1, 1.02
4992, 1, 46, 0, 0.917
4992, 1, 46, 1, 0.917
4992, 78, 1, 0, 0.972
4992, 78, 1, 1, 0.993
4992, 1, 78, 0, 0.919
4992, 1, 78, 1, 0.92
4992, 2094, 1, 0, 0.997
4992, 2094, 1, 1, 1.019
4992, 2049, 46, 0, 0.914
4992, 2049, 46, 1, 0.914
5056, 0, 0, 0, 1.002
5056, 0, 0, 1, 1.0
5056, 47, 0, 0, 1.005
5056, 47, 0, 1, 1.005
5056, 79, 0, 0, 0.989
5056, 79, 0, 1, 0.989
5056, 0, 47, 0, 0.918
5056, 0, 47, 1, 0.919
5056, 0, 79, 0, 0.772
5056, 0, 79, 1, 0.771
5056, 47, 47, 0, 1.006
5056, 47, 47, 1, 1.006
5056, 79, 79, 0, 0.972
5056, 79, 79, 1, 0.972
5056, 2048, 0, 0, 1.001
5056, 2048, 0, 1, 1.0
5056, 2095, 0, 0, 1.004
5056, 2095, 0, 1, 1.004
5056, 2048, 47, 0, 0.908
5056, 2048, 47, 1, 0.909
5056, 2095, 47, 0, 1.006
5056, 2095, 47, 1, 1.006
5056, 47, 1, 0, 1.033
5056, 47, 1, 1, 1.033
5056, 1, 47, 0, 0.919
5056, 1, 47, 1, 0.919
5056, 79, 1, 0, 1.003
5056, 79, 1, 1, 1.005
5056, 1, 79, 0, 0.921
5056, 1, 79, 1, 0.921
5056, 2095, 1, 0, 1.032
5056, 2095, 1, 1, 1.034
5056, 2049, 47, 0, 0.918
5056, 2049, 47, 1, 0.917
5120, 0, 0, 0, 1.003
5120, 0, 0, 1, 1.003
5120, 48, 0, 0, 1.068
5120, 48, 0, 1, 1.068
5120, 80, 0, 0, 1.068
5120, 80, 0, 1, 1.068
5120, 0, 48, 0, 1.065
5120, 0, 48, 1, 1.065
5120, 0, 80, 0, 1.064
5120, 0, 80, 1, 1.065
5120, 48, 48, 0, 1.004
5120, 48, 48, 1, 1.004
5120, 80, 80, 0, 1.005
5120, 80, 80, 1, 1.005
5120, 2048, 0, 0, 1.005
5120, 2048, 0, 1, 1.005
5120, 2096, 0, 0, 1.068
5120, 2096, 0, 1, 1.068
5120, 2048, 48, 0, 1.065
5120, 2048, 48, 1, 1.065
5120, 2096, 48, 0, 1.005
5120, 2096, 48, 1, 1.005
5120, 48, 1, 0, 1.033
5120, 48, 1, 1, 1.031
5120, 1, 48, 0, 0.898
5120, 1, 48, 1, 0.898
5120, 80, 1, 0, 0.844
5120, 80, 1, 1, 0.844
5120, 1, 80, 0, 0.898
5120, 1, 80, 1, 0.898
5120, 2096, 1, 0, 0.856
5120, 2096, 1, 1, 0.855
5120, 2049, 48, 0, 0.898
5120, 2049, 48, 1, 0.898

bench-memcpy-random: length, New Time / Old Time
32768, 0.866
65536, 0.891
131072, 0.896
262144, 0.901
524288, 0.904
1048576, 0.913

bench-memcpy-large: length, align0, align1, dst>src, New Time / Old Time
65543, 0, 0, 0, 0.981
65543, 0, 0, 1, 0.981
65551, 0, 3, 0, 1.012
65551, 0, 3, 1, 1.013
65567, 3, 0, 0, 1.019
65567, 3, 0, 1, 1.02
65599, 3, 5, 0, 1.058
65599, 3, 5, 1, 1.061
65536, 0, 127, 0, 1.046
65536, 0, 127, 1, 1.046
65536, 0, 255, 0, 1.071
65536, 0, 255, 1, 1.071
65536, 0, 256, 0, 0.983
65536, 0, 256, 1, 0.984
65536, 0, 4064, 0, 1.017
65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.981
131079, 0, 0, 1, 0.981
131087, 0, 3, 0, 1.017
131087, 0, 3, 1, 1.017
131103, 3, 0, 0, 1.022
131103, 3, 0, 1, 1.022
131135, 3, 5, 0, 1.064
131135, 3, 5, 1, 1.065
131072, 0, 127, 0, 1.05
131072, 0, 127, 1, 1.05
131072, 0, 255, 0, 1.074
131072, 0, 255, 1, 1.074
131072, 0, 256, 0, 0.984
131072, 0, 256, 1, 0.984
131072, 0, 4064, 0, 1.018
131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.985
262151, 0, 0, 1, 0.985
262159, 0, 3, 0, 1.026
262159, 0, 3, 1, 1.026
262175, 3, 0, 0, 1.03
262175, 3, 0, 1, 1.03
262207, 3, 5, 0, 1.07
262207, 3, 5, 1, 1.07
262144, 0, 127, 0, 1.057
262144, 0, 127, 1, 1.057
262144, 0, 255, 0, 1.079
262144, 0, 255, 1, 1.078
262144, 0, 256, 0, 0.988
262144, 0, 256, 1, 0.988
262144, 0, 4064, 0, 1.02
262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692
524295, 0, 0, 1, 0.692
524303, 0, 3, 0, 0.736
524303, 0, 3, 1, 0.737
524319, 3, 0, 0, 0.758
524319, 3, 0, 1, 0.759
524351, 3, 5, 0, 0.759
524351, 3, 5, 1, 0.759
524288, 0, 127, 0, 1.057
524288, 0, 127, 1, 1.058
524288, 0, 255, 0, 1.079
524288, 0, 255, 1, 1.079
524288, 0, 256, 0, 0.988
524288, 0, 256, 1, 0.988
524288, 0, 4064, 0, 1.02
524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948
1048583, 0, 0, 1, 0.948
1048591, 0, 3, 0, 0.735
1048591, 0, 3, 1, 0.735
1048607, 3, 0, 0, 0.757
1048607, 3, 0, 1, 0.758
1048639, 3, 5, 0, 0.758
1048639, 3, 5, 1, 0.758
1048576, 0, 127, 0, 0.761
1048576, 0, 127, 1, 0.762
1048576, 0, 255, 0, 0.751
1048576, 0, 255, 1, 0.751
1048576, 0, 256, 0, 0.93
1048576, 0, 256, 1, 0.93
1048576, 0, 4064, 0, 0.93
1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928
2097159, 0, 0, 1, 0.931
2097167, 0, 3, 0, 0.735
2097167, 0, 3, 1, 0.734
2097183, 3, 0, 0, 0.759
2097183, 3, 0, 1, 0.759
2097215, 3, 5, 0, 0.758
2097215, 3, 5, 1, 0.757
2097152, 0, 127, 0, 0.77
2097152, 0, 127, 1, 0.77
2097152, 0, 255, 0, 0.745
2097152, 0, 255, 1, 0.745
2097152, 0, 256, 0, 0.924
2097152, 0, 256, 1, 0.925
2097152, 0, 4064, 0, 0.926
2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.894
4194311, 0, 0, 1, 0.896
4194319, 0, 3, 0, 0.752
4194319, 0, 3, 1, 0.751
4194335, 3, 0, 0, 0.82
4194335, 3, 0, 1, 0.821
4194367, 3, 5, 0, 0.788
4194367, 3, 5, 1, 0.789
4194304, 0, 127, 0, 0.801
4194304, 0, 127, 1, 0.801
4194304, 0, 255, 0, 0.802
4194304, 0, 255, 1, 0.804
4194304, 0, 256, 0, 0.873
4194304, 0, 256, 1, 0.868
4194304, 0, 4064, 0, 0.955
4194304, 0, 4064, 1, 0.954
8388615, 0, 0, 0, 0.885
8388615, 0, 0, 1, 0.886
8388623, 0, 3, 0, 0.769
8388623, 0, 3, 1, 0.769
8388639, 3, 0, 0, 0.87
8388639, 3, 0, 1, 0.87
8388671, 3, 5, 0, 0.811
8388671, 3, 5, 1, 0.814
8388608, 0, 127, 0, 0.83
8388608, 0, 127, 1, 0.83
8388608, 0, 255, 0, 0.857
8388608, 0, 255, 1, 0.857
8388608, 0, 256, 0, 0.851
8388608, 0, 256, 1, 0.848
8388608, 0, 4064, 0, 0.981
8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.885
16777223, 0, 0, 1, 0.886
16777231, 0, 3, 0, 0.769
16777231, 0, 3, 1, 0.768
16777247, 3, 0, 0, 0.87
16777247, 3, 0, 1, 0.87
16777279, 3, 5, 0, 0.811
16777279, 3, 5, 1, 0.814
16777216, 0, 127, 0, 0.831
16777216, 0, 127, 1, 0.83
16777216, 0, 255, 0, 0.857
16777216, 0, 255, 1, 0.857
16777216, 0, 256, 0, 0.852
16777216, 0, 256, 1, 0.848
16777216, 0, 4064, 0, 0.98
16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.885
33554439, 0, 0, 1, 0.886
33554447, 0, 3, 0, 0.768
33554447, 0, 3, 1, 0.768
33554463, 3, 0, 0, 0.871
33554463, 3, 0, 1, 0.87
33554495, 3, 5, 0, 0.811
33554495, 3, 5, 1, 0.814
33554432, 0, 127, 0, 0.831
33554432, 0, 127, 1, 0.831
33554432, 0, 255, 0, 0.858
33554432, 0, 255, 1, 0.857
33554432, 0, 256, 0, 0.852
33554432, 0, 256, 1, 0.848
33554432, 0, 4064, 0, 0.98
33554432, 0, 4064, 1, 0.981

 sysdeps/x86_64/multiarch/Makefile        |    1 -
 sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 3151 ----------------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S |  386 ++-
 3 files changed, 382 insertions(+), 3156 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
\ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ - memcpy-ssse3 \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S deleted file mode 100644 index 65644d3a09..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ /dev/null @@ -1,3151 +0,0 @@ -/* memcpy with SSSE3 - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -# define MEMPCPY __mempcpy_ssse3 -# define MEMPCPY_CHK __mempcpy_chk_ssse3 -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. 
*/ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(write_0bytes) - cmp $79, %rdx - jbe L(copy_forward) - jmp L(copy_backward) -L(copy_forward): -#endif -L(start): - cmp $79, %rdx - lea L(table_less_80bytes)(%rip), %r11 - ja L(80bytesormore) - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(80bytesormore): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_fwd) - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - - .p2align 4 -L(copy_backward): - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_bwd) - and $0xf, %r9 - jz L(shl_0_bwd) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - - .p2align 4 -L(shl_0): - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble) - cmp $64, %rdx - jb L(shl_0_less_64bytes) - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -L(shl_0_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_cache_less_64bytes) - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_cache_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 
0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_less_64bytes) - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_mem_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_less_32bytes) - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -L(shl_0_mem_less_32bytes): - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble_bwd) - cmp $64, %rdx - jb L(shl_0_less_64bytes_bwd) - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -L(shl_0_less_64bytes_bwd): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_bwd): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_bwd_loop) -L(shl_0_gobble_bwd_loop): - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_gobble_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_gobble_bwd_less_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_bwd_loop): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa 
%xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_mem_bwd_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_bwd_less_32bytes) - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -L(shl_0_mem_bwd_less_32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1): - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_fwd) - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 -L(L1_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_1_loop_L1): - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_1_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_bwd) - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 -L(L1_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_1_bwd_loop_L1): - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 - movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_1_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2): - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_fwd) - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 -L(L2_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_2_loop_L1): - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) 
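
(Aside for readers less familiar with this file: the L(shl_N) blocks above and below are all instances of one SSSE3 idiom. Once the destination has been rounded up to a 16-byte boundary, the source is only ever read at 16-byte-aligned addresses, and each output vector is stitched together from two adjacent aligned loads with palignr, whose immediate equals the source misalignment N. A minimal C intrinsics sketch of one 64-byte step, assuming dst is 16-byte aligned and src sits exactly 1 byte past an alignment boundary, i.e. the L(shl_1) case; copy64_shift1 is a hypothetical name, and _mm_alignr_epi8 needs a compile-time-constant shift, which is why the asm has sixteen separate blocks:

#include <tmmintrin.h>

static void
copy64_shift1 (unsigned char *dst, const unsigned char *src)
{
  /* Five aligned loads bracket the 64 source bytes.  Reading the
     byte before src cannot fault because the load is aligned,
     mirroring movaps -0x01(%rsi); the reads also run up to 15
     bytes past the 64 copied, which is safe inside the main loop
     where more source data follows.  */
  const __m128i *s = (const __m128i *) (src - 1);
  __m128i a = _mm_load_si128 (s);
  __m128i b = _mm_load_si128 (s + 1);
  __m128i c = _mm_load_si128 (s + 2);
  __m128i d = _mm_load_si128 (s + 3);
  __m128i e = _mm_load_si128 (s + 4);

  /* Each palignr shifts a 32-byte pair right by 1 byte, yielding
     16 correctly positioned bytes per aligned store.  */
  _mm_store_si128 ((__m128i *) dst + 0, _mm_alignr_epi8 (b, a, 1));
  _mm_store_si128 ((__m128i *) dst + 1, _mm_alignr_epi8 (c, b, 1));
  _mm_store_si128 ((__m128i *) dst + 2, _mm_alignr_epi8 (d, c, 1));
  _mm_store_si128 ((__m128i *) dst + 3, _mm_alignr_epi8 (e, d, 1));
}

The asm additionally carries the boundary vector (%xmm1) across iterations, so each 64-byte step costs only four fresh loads.)
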
- jb L(shl_2_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_bwd) - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 -L(L2_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_2_bwd_loop_L1): - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_2_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3): - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_fwd) - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 -L(L3_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_3_loop_L1): - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_3_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_bwd) - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 -L(L3_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_3_bwd_loop_L1): - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_3_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4): - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_fwd) - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 -L(L4_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_4_loop_L1): - sub $64, %rdx - movaps 0x0c(%rsi), 
%xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_4_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_bwd) - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 -L(L4_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_4_bwd_loop_L1): - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_4_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5): - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_fwd) - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 -L(L5_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_5_loop_L1): - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_5_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_bwd) - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 -L(L5_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_5_bwd_loop_L1): - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_5_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - 
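
(Aside: each L(shl_N) body has two entry points. L(shl_N_loop_L2) executes a prefetchnta and falls straight through into L(shl_N_loop_L1). Which entry to use is decided once, before the loop, by comparing the length against the cache-size threshold in %rcx and nudging the indirect-jump target in %r9 by the distance between the two labels, so the hot loop never re-tests the size. The same one-time selection in C, as a sketch with hypothetical names, threshold standing in for the cached cache-size value:

#include <stddef.h>
#include <string.h>
#include <xmmintrin.h>

static void
copy_block64 (char *dst, const char *src)
{
  memcpy (dst, src, 64);
}

/* Loop used when the copy is below the threshold: no software
   prefetch, the data is expected to be cache-friendly.  */
static void
loop_l1 (char *dst, const char *src, size_t n)
{
  for (; n >= 64; n -= 64, src += 64, dst += 64)
    copy_block64 (dst, src);
}

/* Same loop for larger copies: pull source lines ~448 bytes ahead
   with prefetchnta, matching the 0x1c0(%rsi) distance above.  */
static void
loop_l2 (char *dst, const char *src, size_t n)
{
  for (; n >= 64; n -= 64, src += 64, dst += 64)
    {
      _mm_prefetch (src + 0x1c0, _MM_HINT_NTA);
      copy_block64 (dst, src);
    }
}

static void
copy_large (char *dst, const char *src, size_t n, size_t threshold)
{
  /* Decided once; the asm encodes this choice by adding the label
     distance to the indirect-jump target instead of using a
     function pointer.  */
  void (*loop) (char *, const char *, size_t)
    = n < threshold ? loop_l1 : loop_l2;
  loop (dst, src, n);
}

In the asm the two "functions" share all their code: L2 is literally the same loop with one extra instruction in front of it.)
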
.p2align 4 -L(shl_6): - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_fwd) - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 -L(L6_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_6_loop_L1): - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 -L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 -L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 -L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, 
-0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 -L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 -L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 -L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 -L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 -L(L9_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_9_loop_L1): - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_9_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_bwd) - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 -L(L9_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 
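
(Aside: both dispatch sites here, BRANCH_TO_JMPTBL_ENTRY and the hand-inlined copy at L(start), use a table of 32-bit label-minus-table offsets rather than absolute pointers: JMPTBL(I, B) stores I - B, and the movslq/add pair rebuilds the absolute target, so the table needs no dynamic relocations and is half the size of an array of 64-bit pointers. Note also that %rsi/%rdi are advanced by the length before the jump, so every L(write_Nbytes) target can address its bytes at fixed negative offsets from the end. A rough C rendering of the dispatch, using GNU C's computed-goto extension and a hypothetical 4-entry table:

#include <string.h>

static void
write_tail (char *dst, const char *src, unsigned n)  /* n in 0..3 */
{
  /* Absolute label addresses; the asm stores 32-bit relative
     offsets instead, for position independence and density.  */
  static void *const jt[] = { &&w0, &&w1, &&w2, &&w3 };
  dst += n;   /* pre-advance, as the asm does before dispatching, */
  src += n;   /* so each target indexes backwards from the end    */
  goto *jt[n];
 w3:
  memcpy (dst - 3, src - 3, 2);  /* 2-byte move at -3, then an     */
  memcpy (dst - 2, src - 2, 2);  /* overlapping 2-byte move at -2, */
  return;                        /* exactly like L(write_3bytes)   */
 w2:
  memcpy (dst - 2, src - 2, 2);
  return;
 w1:
  dst[-1] = src[-1];
  return;
 w0:
  return;
}

)
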
-L(shl_9_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_9_bwd_loop_L1): - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_9_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10): - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_fwd) - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 -L(L10_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_10_loop_L1): - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_10_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_bwd) - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 -L(L10_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_10_bwd_loop_L1): - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_10_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11): - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_fwd) - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 -L(L11_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_11_loop_L1): - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_11_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, 
-0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_bwd) - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 -L(L11_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_11_bwd_loop_L1): - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_11_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12): - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_fwd) - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 -L(L12_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_12_loop_L1): - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_12_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_bwd) - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 -L(L12_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_12_bwd_loop_L1): - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_12_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13): - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_fwd) - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 -L(L13_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_13_loop_L1): - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, 
%xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_13_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_bwd) - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 -L(L13_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_13_bwd_loop_L1): - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_13_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14): - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_fwd) - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 -L(L14_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_14_loop_L1): - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_14_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_bwd) - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 -L(L14_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_14_bwd_loop_L1): - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_14_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15): - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 - cmp 
%rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_fwd) - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 -L(L15_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_15_loop_L1): - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_15_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_bwd) - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 -L(L15_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_15_bwd_loop_L1): - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_15_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(write_72bytes): - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_64bytes): - movdqu -64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_56bytes): - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_48bytes): - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_40bytes): - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_32bytes): - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov 
-16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_24bytes): - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_16bytes): - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -L(write_0bytes): - ret - - .p2align 4 -L(write_73bytes): - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_65bytes): - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_57bytes): - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_49bytes): - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_41bytes): - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_33bytes): - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_25bytes): - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_17bytes): - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_9bytes): - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_1bytes): - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_74bytes): - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_66bytes): - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - 
mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_58bytes): - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_50bytes): - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_42bytes): - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_34bytes): - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_26bytes): - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_18bytes): - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_10bytes): - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_2bytes): - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(write_75bytes): - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_67bytes): - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_59bytes): - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_51bytes): - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - 
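
(Aside: the long run of L(write_Nbytes) blocks in this stretch follows a single generation rule: copy the final N bytes back-to-front with the widest moves available, and let the last narrow move overlap bytes already written rather than descending to a byte loop; e.g. L(write_65bytes) ends with an 8-byte move at -9 and a 4-byte move at -4 that overlap by three bytes. A condensed C sketch of that rule for one size class, with a hypothetical helper name and memcpy standing in for the unaligned register moves:

#include <stdint.h>
#include <string.h>

/* Copy the last n bytes, 8 <= n <= 16, with two possibly
   overlapping 8-byte moves: the same trick as
   L(write_9bytes)..L(write_16bytes), collapsed into one path.  */
static void
copy_tail_8_16 (char *dst_end, const char *src_end, size_t n)
{
  uint64_t head, tail;
  memcpy (&head, src_end - n, 8);   /* first 8 bytes of the tail   */
  memcpy (&tail, src_end - 8, 8);   /* last 8, overlaps when n < 16 */
  memcpy (dst_end - n, &head, 8);
  memcpy (dst_end - 8, &tail, 8);
}

The asm spells out every size separately only because the jump table makes each exact-size path branch-free.)
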
mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_43bytes): - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_35bytes): - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_27bytes): - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_19bytes): - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_11bytes): - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(write_76bytes): - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_68bytes): - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_60bytes): - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_52bytes): - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_44bytes): - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_36bytes): - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_28bytes): - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_20bytes): - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, 
-4(%rdi) - ret - - .p2align 4 -L(write_12bytes): - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_77bytes): - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_69bytes): - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_61bytes): - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_53bytes): - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_45bytes): - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_37bytes): - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_29bytes): - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_21bytes): - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_13bytes): - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_78bytes): - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_70bytes): - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, 
-22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_62bytes): - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - 
mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_23bytes): - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif -L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#endif - .p2align 4 -L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - 
lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif -L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) -#endif - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL 
(L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - - .p2align 3 -L(shl_table): - .int 
JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S index 295430b1ef..84e4e0f6cb 100644 --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S +++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S @@ -1,4 +1,382 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3 -#define MEMCPY_CHK __memmove_chk_ssse3 -#include "memcpy-ssse3.S" +#include <sysdep.h> + +#ifndef MEMMOVE +# define MEMMOVE __memmove_ssse3 +# define MEMMOVE_CHK __memmove_chk_ssse3 +# define MEMCPY __memcpy_ssse3 +# define MEMCPY_CHK __memcpy_chk_ssse3 +# define MEMPCPY __mempcpy_ssse3 +# define MEMPCPY_CHK __mempcpy_chk_ssse3 +#endif + + .section .text.ssse3, "ax", @progbits +ENTRY(MEMPCPY_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET(__chk_fail) +END(MEMPCPY_CHK) + +ENTRY(MEMPCPY) + mov %RDI_LP, %RAX_LP + add %RDX_LP, %RAX_LP + jmp L(start) +END(MEMPCPY) + +ENTRY(MEMMOVE_CHK) + cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET(__chk_fail) +END(MEMMOVE_CHK) + +ENTRY_P2ALIGN(MEMMOVE, 6) + movq %rdi, %rax +L(start): + cmpq $16, %rdx + jb L(copy_0_15) + + /* These loads are always useful. 
*/ + movups 0(%rsi), %xmm0 + movups -16(%rsi, %rdx), %xmm7 + cmpq $32, %rdx + ja L(more_2x_vec) + + movups %xmm0, 0(%rdi) + movups %xmm7, -16(%rdi, %rdx) + ret + + .p2align 4,, 8 +L(copy_4x_vec): + movups 16(%rsi), %xmm1 + movups -32(%rsi, %rdx), %xmm2 + + movups %xmm0, 0(%rdi) + movups %xmm1, 16(%rdi) + movups %xmm2, -32(%rdi, %rdx) + movups %xmm7, -16(%rdi, %rdx) + ret + + .p2align 4,, 8 +L(copy_0_15): + cmpl $8, %edx + ja L(copy_9_15) + + cmpl $4, %edx + jb L(copy_0_3) + + movl 0(%rsi), %ecx + movl -4(%rsi, %rdx), %esi + movl %ecx, 0(%rdi) + movl %esi, -4(%rdi, %rdx) + ret + + .p2align 4,, 8 +L(copy_9_15): + movq 0(%rsi), %rcx + movq -8(%rsi, %rdx), %rsi + movq %rcx, 0(%rdi) + movq %rsi, -8(%rdi, %rdx) + ret + + .p2align 4,, 4 +L(copy_0_3): + cmpl $1, %edx + jl L(copy_0_0) + movzbl (%rsi), %ecx + je L(copy_0_1) + + movzwl -2(%rsi, %rdx), %esi + movw %si, -2(%rdi, %rdx) +L(copy_0_1): + movb %cl, (%rdi) +L(copy_0_0): +L(nop): + ret + + .p2align 4 +L(more_2x_vec): + cmpq $64, %rdx + jbe L(copy_4x_vec) + + /* We use rcx later to get alignr value. */ + movq %rdi, %rcx + + /* Backward copy for overlap + dst > src for memmove safety. */ + subq %rsi, %rcx + cmpq %rdx, %rcx + jb L(copy_backward) + + /* Load tail. */ + + /* -16(%rsi, %rdx) already loaded into xmm7. */ + movups -32(%rsi, %rdx), %xmm8 + movups -48(%rsi, %rdx), %xmm9 + + /* Get misalignment. */ + andl $0xf, %ecx + + movq %rsi, %r9 + addq %rcx, %rsi + andq $-16, %rsi + /* Get first vec for `palignr`. */ + movaps (%rsi), %xmm1 + + /* We have loaded (%rsi) so safe to do this store before the + loop. */ + movups %xmm0, (%rdi) + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_shared_cache_size_half(%rip), %rdx +#endif + ja L(large_memcpy) + + leaq -64(%rdi, %rdx), %r8 + andq $-16, %rdi + movl $48, %edx + + leaq L(loop_fwd_start)(%rip), %r9 + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx + + .p2align 4,, 8 +L(copy_backward): + testq %rcx, %rcx + jz L(nop) + + /* Preload tail. */ + + /* (%rsi) already loaded into xmm0. */ + movups 16(%rsi), %xmm4 + movups 32(%rsi), %xmm5 + + movq %rdi, %r8 + subq %rdi, %rsi + leaq -49(%rdi, %rdx), %rdi + andq $-16, %rdi + addq %rdi, %rsi + andq $-16, %rsi + + movaps 48(%rsi), %xmm6 + + + leaq L(loop_bkwd_start)(%rip), %r9 + andl $0xf, %ecx + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx + + .p2align 4,, 8 +L(large_memcpy): + movups -64(%r9, %rdx), %xmm10 + movups -80(%r9, %rdx), %xmm11 + + sall $5, %ecx + leal (%rcx, %rcx, 2), %r8d + leaq -96(%rdi, %rdx), %rcx + andq $-16, %rdi + leaq L(large_loop_fwd_start)(%rip), %rdx + addq %r8, %rdx + jmp * %rdx + + + /* Instead of a typical jump table all 16 loops are exactly + 64-bytes in size. So, we can just jump to first loop + r8 * + 64. Before modifying any loop ensure all their sizes match! + */ + .p2align 6 +L(loop_fwd_start): +L(loop_fwd_0x0): + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + addq %rdx, %rdi + addq %rdx, %rsi + cmpq %rdi, %r8 + ja L(loop_fwd_0x0) +L(end_loop_fwd): + movups %xmm9, 16(%r8) + movups %xmm8, 32(%r8) + movups %xmm7, 48(%r8) + ret + + /* Extactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding. + 60 bytes otherwise. 
*/ +#define ALIGNED_LOOP_FWD(align_by); \ + .p2align 6; \ +L(loop_fwd_ ## align_by): \ + movaps 16(%rsi), %xmm0; \ + movaps 32(%rsi), %xmm2; \ + movaps 48(%rsi), %xmm3; \ + movaps %xmm3, %xmm4; \ + palignr $align_by, %xmm2, %xmm3; \ + palignr $align_by, %xmm0, %xmm2; \ + palignr $align_by, %xmm1, %xmm0; \ + movaps %xmm4, %xmm1; \ + movaps %xmm0, 16(%rdi); \ + movaps %xmm2, 32(%rdi); \ + movaps %xmm3, 48(%rdi); \ + addq %rdx, %rdi; \ + addq %rdx, %rsi; \ + cmpq %rdi, %r8; \ + ja L(loop_fwd_ ## align_by); \ + jmp L(end_loop_fwd); + + /* Must be in descending order. */ + ALIGNED_LOOP_FWD (0xf) + ALIGNED_LOOP_FWD (0xe) + ALIGNED_LOOP_FWD (0xd) + ALIGNED_LOOP_FWD (0xc) + ALIGNED_LOOP_FWD (0xb) + ALIGNED_LOOP_FWD (0xa) + ALIGNED_LOOP_FWD (0x9) + ALIGNED_LOOP_FWD (0x8) + ALIGNED_LOOP_FWD (0x7) + ALIGNED_LOOP_FWD (0x6) + ALIGNED_LOOP_FWD (0x5) + ALIGNED_LOOP_FWD (0x4) + ALIGNED_LOOP_FWD (0x3) + ALIGNED_LOOP_FWD (0x2) + ALIGNED_LOOP_FWD (0x1) + + .p2align 6 +L(large_loop_fwd_start): +L(large_loop_fwd_0x0): + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps 64(%rsi), %xmm4 + movaps 80(%rsi), %xmm5 + movntps %xmm1, 16(%rdi) + movntps %xmm2, 32(%rdi) + movntps %xmm3, 48(%rdi) + movntps %xmm4, 64(%rdi) + movntps %xmm5, 80(%rdi) + addq $80, %rdi + addq $80, %rsi + cmpq %rdi, %rcx + ja L(large_loop_fwd_0x0) + + /* Ensure no icache line split on tail. */ + .p2align 4 +L(end_large_loop_fwd): + sfence + movups %xmm11, 16(%rcx) + movups %xmm10, 32(%rcx) + movups %xmm9, 48(%rcx) + movups %xmm8, 64(%rcx) + movups %xmm7, 80(%rcx) + ret + + + /* Size > 64 bytes and <= 96 bytes. 32-byte align between ensure + 96-byte spacing between each. */ +#define ALIGNED_LARGE_LOOP_FWD(align_by); \ + .p2align 5; \ +L(large_loop_fwd_ ## align_by): \ + movaps 16(%rsi), %xmm0; \ + movaps 32(%rsi), %xmm2; \ + movaps 48(%rsi), %xmm3; \ + movaps 64(%rsi), %xmm4; \ + movaps 80(%rsi), %xmm5; \ + movaps %xmm5, %xmm6; \ + palignr $align_by, %xmm4, %xmm5; \ + palignr $align_by, %xmm3, %xmm4; \ + palignr $align_by, %xmm2, %xmm3; \ + palignr $align_by, %xmm0, %xmm2; \ + palignr $align_by, %xmm1, %xmm0; \ + movaps %xmm6, %xmm1; \ + movntps %xmm0, 16(%rdi); \ + movntps %xmm2, 32(%rdi); \ + movntps %xmm3, 48(%rdi); \ + movntps %xmm4, 64(%rdi); \ + movntps %xmm5, 80(%rdi); \ + addq $80, %rdi; \ + addq $80, %rsi; \ + cmpq %rdi, %rcx; \ + ja L(large_loop_fwd_ ## align_by); \ + jmp L(end_large_loop_fwd); + + /* Must be in descending order. */ + ALIGNED_LARGE_LOOP_FWD (0xf) + ALIGNED_LARGE_LOOP_FWD (0xe) + ALIGNED_LARGE_LOOP_FWD (0xd) + ALIGNED_LARGE_LOOP_FWD (0xc) + ALIGNED_LARGE_LOOP_FWD (0xb) + ALIGNED_LARGE_LOOP_FWD (0xa) + ALIGNED_LARGE_LOOP_FWD (0x9) + ALIGNED_LARGE_LOOP_FWD (0x8) + ALIGNED_LARGE_LOOP_FWD (0x7) + ALIGNED_LARGE_LOOP_FWD (0x6) + ALIGNED_LARGE_LOOP_FWD (0x5) + ALIGNED_LARGE_LOOP_FWD (0x4) + ALIGNED_LARGE_LOOP_FWD (0x3) + ALIGNED_LARGE_LOOP_FWD (0x2) + ALIGNED_LARGE_LOOP_FWD (0x1) + + + .p2align 6 +L(loop_bkwd_start): +L(loop_bkwd_0x0): + movaps 32(%rsi), %xmm1 + movaps 16(%rsi), %xmm2 + movaps 0(%rsi), %xmm3 + movaps %xmm1, 32(%rdi) + movaps %xmm2, 16(%rdi) + movaps %xmm3, 0(%rdi) + subq $48, %rdi + subq $48, %rsi + cmpq %rdi, %r8 + jb L(loop_bkwd_0x0) +L(end_loop_bkwd): + movups %xmm7, -16(%r8, %rdx) + movups %xmm0, 0(%r8) + movups %xmm4, 16(%r8) + movups %xmm5, 32(%r8) + + ret + + + /* Extactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding. + 60 bytes otherwise. 
*/ +#define ALIGNED_LOOP_BKWD(align_by); \ + .p2align 6; \ +L(loop_bkwd_ ## align_by): \ + movaps 32(%rsi), %xmm1; \ + movaps 16(%rsi), %xmm2; \ + movaps 0(%rsi), %xmm3; \ + palignr $align_by, %xmm1, %xmm6; \ + palignr $align_by, %xmm2, %xmm1; \ + palignr $align_by, %xmm3, %xmm2; \ + movaps %xmm6, 32(%rdi); \ + movaps %xmm1, 16(%rdi); \ + movaps %xmm2, 0(%rdi); \ + subq $48, %rdi; \ + subq $48, %rsi; \ + movaps %xmm3, %xmm6; \ + cmpq %rdi, %r8; \ + jb L(loop_bkwd_ ## align_by); \ + jmp L(end_loop_bkwd); + + /* Must be in descending order. */ + ALIGNED_LOOP_BKWD (0xf) + ALIGNED_LOOP_BKWD (0xe) + ALIGNED_LOOP_BKWD (0xd) + ALIGNED_LOOP_BKWD (0xc) + ALIGNED_LOOP_BKWD (0xb) + ALIGNED_LOOP_BKWD (0xa) + ALIGNED_LOOP_BKWD (0x9) + ALIGNED_LOOP_BKWD (0x8) + ALIGNED_LOOP_BKWD (0x7) + ALIGNED_LOOP_BKWD (0x6) + ALIGNED_LOOP_BKWD (0x5) + ALIGNED_LOOP_BKWD (0x4) + ALIGNED_LOOP_BKWD (0x3) + ALIGNED_LOOP_BKWD (0x2) + ALIGNED_LOOP_BKWD (0x1) +END(MEMMOVE) + +strong_alias (MEMMOVE, MEMCPY) +strong_alias (MEMMOVE_CHK, MEMCPY_CHK) -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein @ 2022-03-25 20:44 ` Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein 4 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 - sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - 5 files changed, 3209 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 48f81711ae..323be3b969 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -16,14 +16,12 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ memmove-avx512-unaligned-erms \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 70b0e9c62e..d6852ab365 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, @@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned) @@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, @@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, 
CPU_FEATURE_USABLE (AVX512F), __memcpy_avx512_no_vzeroupper) @@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, @@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 1ecdd4b0d3..5596ddea2c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -92,13 +92,6 @@ IFUNC_SELECTOR (void) } } - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) - { - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); - } - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) return OPTIMIZE (sse2_unaligned_erms); diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. 
*/ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, 
-0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 
- movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 
-L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - 
lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - 
movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) -L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 
0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - 
movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, 
%rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, 
-0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx -L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy_fwd): - sub $0x80, %rdx -L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx -L(ll_cache_copy_fwd): - add %rcx, %rdx -L(ll_cache_copy_fwd_start): - sub $0x80, %rdx -L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) -L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx -L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy): - sub $0x80, %rdx -L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), 
%xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx -L(ll_cache_copy): - add %rcx, %rdx -L(ll_cache_copy_bwd_start): - sub $0x80, %rdx -L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) -L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -L(fwd_write_0bytes): - ret - - - .p2align 4 -L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, 
-30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) 
-L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -L(fwd_write_21bytes): 
- lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) 
-L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -L(bwd_write_0bytes): - ret - - .p2align 4 -L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - - .p2align 4 -L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 
123(%rdi) -L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - 
movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - 
movdqu %xmm0, 50(%rdi) -L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - - .p2align 3 -L(table_144_bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_67bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_125bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - - .p2align 3 -L(shl_table_fwd): - .int JMPTBL (L(shl_0), L(shl_table_fwd)) - .int JMPTBL (L(shl_1), L(shl_table_fwd)) - .int JMPTBL (L(shl_2), L(shl_table_fwd)) - .int JMPTBL (L(shl_3), L(shl_table_fwd)) - .int JMPTBL (L(shl_4), L(shl_table_fwd)) - .int JMPTBL (L(shl_5), L(shl_table_fwd)) - .int JMPTBL (L(shl_6), L(shl_table_fwd)) - .int JMPTBL (L(shl_7), L(shl_table_fwd)) - .int JMPTBL (L(shl_8), L(shl_table_fwd)) - .int JMPTBL (L(shl_9), L(shl_table_fwd)) - .int JMPTBL (L(shl_10), L(shl_table_fwd)) - .int JMPTBL (L(shl_11), L(shl_table_fwd)) - .int JMPTBL (L(shl_12), L(shl_table_fwd)) - .int JMPTBL (L(shl_13), L(shl_table_fwd)) - .int JMPTBL (L(shl_14), L(shl_table_fwd)) - .int JMPTBL (L(shl_15), L(shl_table_fwd)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S deleted file mode 100644 index f9a4e9aff9..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_back -#define MEMCPY_CHK __memmove_chk_ssse3_back -#include "memcpy-ssse3-back.S" -- 2.25.1
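A note on the technique in the file removed above: the L(shl_N) loops exist because PALIGNR can splice two adjacent 16-byte aligned loads into one 16-byte chunk of a misaligned source stream, so the hot loop issues only aligned loads and aligned stores. Below is a minimal C sketch of that splice using the SSSE3 intrinsic _mm_alignr_epi8; it is an illustration of the idea, not the removed glibc code, and the names copy64_shifted and SHIFT are invented for the example.

/* Sketch: copy 64 bytes to a 16-byte-aligned dst from a src with
   src % 16 == SHIFT.  Assumes reading a few bytes past src + 64 is
   safe (the original loop likewise over-reads within its block).
   Build with -mssse3 or equivalent.  */
#include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 */

enum { SHIFT = 14 };            /* PALIGNR takes an immediate count */

static void
copy64_shifted (char *dst, const char *src)
{
  /* Step back to the aligned base so every load below is aligned.  */
  const __m128i *s = (const __m128i *) (src - SHIFT);
  __m128i a = _mm_load_si128 (s + 0);
  __m128i b = _mm_load_si128 (s + 1);
  __m128i c = _mm_load_si128 (s + 2);
  __m128i d = _mm_load_si128 (s + 3);
  __m128i e = _mm_load_si128 (s + 4);
  /* _mm_alignr_epi8 (hi, lo, SHIFT) shifts the 32-byte value hi:lo
     right by SHIFT bytes, reassembling one 16-byte chunk of the
     misaligned source stream from two aligned halves.  */
  _mm_store_si128 ((__m128i *) dst + 0, _mm_alignr_epi8 (b, a, SHIFT));
  _mm_store_si128 ((__m128i *) dst + 1, _mm_alignr_epi8 (c, b, SHIFT));
  _mm_store_si128 ((__m128i *) dst + 2, _mm_alignr_epi8 (d, c, SHIFT));
  _mm_store_si128 ((__m128i *) dst + 3, _mm_alignr_epi8 (e, d, SHIFT));
}

Because PALIGNR's shift count is an immediate, the removed file keeps one specialized loop per source/destination misalignment and selects it at run time through L(shl_table_fwd)/L(shl_table_bwd); the L(table_144_bytes_fwd)/L(table_144_bytes_bwd) tables play the same role for the 0-143 byte tails. The other idea visible above is the store-strategy switch in L(gobble_mem_fwd)/L(gobble_mem_bwd): once the copy exceeds roughly half the shared cache (read from __x86_shared_cache_size_half), stores go non-temporal. A hedged sketch of that cutoff, again with invented names (copy_vecs, nvec, cache_half_vecs):

/* Sketch of the streaming-store cutoff, not the removed code: nvec
   16-byte vectors, cache_half_vecs ~ half the shared cache in
   vectors.  SSE2 only.  */
#include <emmintrin.h>          /* _mm_stream_si128, _mm_sfence */
#include <stddef.h>

static void
copy_vecs (__m128i *dst, const __m128i *src, size_t nvec,
           size_t cache_half_vecs)
{
  if (nvec > cache_half_vecs)
    {
      /* Larger than ~half the shared cache: bypass it with
         non-temporal stores so the copy does not evict the
         working set, then fence to order them.  */
      for (size_t i = 0; i < nvec; i++)
        _mm_stream_si128 (dst + i, _mm_loadu_si128 (src + i));
      _mm_sfence ();
    }
  else
    for (size_t i = 0; i < nvec; i++)
      _mm_store_si128 (dst + i, _mm_loadu_si128 (src + i));
}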
* [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (2 preceding siblings ...) 2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-03-25 20:44 ` Noah Goldstein 2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein 4 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 - sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 --------------------- sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 - 5 files changed, 879 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 323be3b969..a2ebc06c5f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -59,7 +59,6 @@ sysdep_routines += \ strcat-evex \ strcat-sse2 \ strcat-sse2-unaligned \ - strcat-ssse3 \ strchr-avx2 \ strchr-avx2-rtm \ strchr-evex \ @@ -97,7 +96,6 @@ sysdep_routines += \ strncat-c \ strncat-evex \ strncat-sse2-unaligned \ - strncat-ssse3 \ strncmp-avx2 \ strncmp-avx2-rtm \ strncmp-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d6852ab365..4133ed7e43 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcat_evex) - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), - __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) @@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncat_evex) - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), - __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 5bece38f78..a15afa44e9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -23,7 +23,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S deleted file mode 100644 
index 9f39e4fcd1..0000000000 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ /dev/null @@ -1,866 +0,0 @@ -/* strcat with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_ssse3 -# endif - -# define USE_AS_STRCAT - -.text -ENTRY (STRCAT) -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - -/* Inline corresponding strlen file, temporary until new strcpy - implementation gets merged. */ - - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - 
pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) - -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(exit) - -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(exit) - -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(exit) - -L(aligned_64_exit_16): - lea -48(%rax), %rax - -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail1): - add $1, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail2): - add $2, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail3): - add $3, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail4): - add $4, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail5): - add $5, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail6): - add $6, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail7): - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail8): - add $8, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail9): - add $9, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail10): - add $10, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail11): - add $11, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail12): - add $12, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail13): - add $13, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail14): - add $14, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail15): - add $15, %eax - - .p2align 4 -L(StartStrcpyPart): - mov %rsi, %rcx - lea (%rdi, %rax), %rdx -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(StrncatExit0) - cmp $8, %r8 - jbe L(StrncatExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) - cmpb $0, 8(%rcx) - jz L(Exit9) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - jb L(StrncatExit15Bytes) -# endif - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) - cmpb $0, 15(%rcx) - jz L(Exit16) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - je L(StrncatExit16) -# define USE_AS_STRNCPY -# endif - -# include 
"strcpy-ssse3.S" - - .p2align 4 -L(CopyFrom1To16Bytes): - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit1): - xor %ah, %ah - movb %ah, 1(%rdx) -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit2): - xor %ah, %ah - movb %ah, 2(%rdx) -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit3): - xor %ah, %ah - movb %ah, 3(%rdx) -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit4): - xor %ah, %ah - movb %ah, 4(%rdx) -L(Exit4): - mov (%rcx), %eax - mov %eax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit5): - xor %ah, %ah - movb %ah, 5(%rdx) -L(Exit5): - mov (%rcx), %eax - mov %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit6): - xor %ah, %ah - movb %ah, 6(%rdx) -L(Exit6): - mov (%rcx), %eax - mov %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit7): - xor %ah, %ah - movb %ah, 7(%rdx) -L(Exit7): - mov (%rcx), %eax - mov %eax, (%rdx) - mov 3(%rcx), %eax - mov %eax, 3(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8): - xor %ah, %ah - movb %ah, 8(%rdx) -L(Exit8): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit9): - xor %ah, %ah - movb %ah, 9(%rdx) -L(Exit9): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movb 8(%rcx), %al - movb %al, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit10): - xor %ah, %ah - movb %ah, 10(%rdx) -L(Exit10): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movw 8(%rcx), %ax - movw %ax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit11): - xor %ah, %ah - movb %ah, 11(%rdx) -L(Exit11): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit12): - xor %ah, %ah - movb %ah, 12(%rdx) -L(Exit12): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit13): - xor %ah, %ah - movb %ah, 13(%rdx) -L(Exit13): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 5(%rcx), %xmm1 - movlpd %xmm1, 5(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit14): - xor %ah, %ah - movb %ah, 14(%rdx) -L(Exit14): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 6(%rcx), %xmm1 - movlpd %xmm1, 6(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15): - xor %ah, %ah - movb %ah, 15(%rdx) -L(Exit15): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit16): - xor %ah, %ah - movb %ah, 16(%rdx) -L(Exit16): - movlpd (%rcx), 
%xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - test $0x01, %al - jnz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - test $0x02, %al - jnz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - test $0x04, %al - jnz L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - test $0x08, %al - jnz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - test $0x10, %al - jnz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - test $0x20, %al - jnz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - test $0x40, %al - jnz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase2): - test $0x01, %ah - jnz L(Exit9) - cmp $9, %r8 - je L(StrncatExit9) - test $0x02, %ah - jnz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - test $0x04, %ah - jnz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - test $0x8, %ah - jnz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - test $0x10, %ah - jnz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - test $0x20, %ah - jnz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - test $0x40, %ah - jnz L(Exit15) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $8, %r8 - ja L(ExitHighCase3) - cmp $1, %r8 - je L(StrncatExit1) - cmp $2, %r8 - je L(StrncatExit2) - cmp $3, %r8 - je L(StrncatExit3) - cmp $4, %r8 - je L(StrncatExit4) - cmp $5, %r8 - je L(StrncatExit5) - cmp $6, %r8 - je L(StrncatExit6) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - xor %ah, %ah - movb %ah, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase3): - cmp $9, %r8 - je L(StrncatExit9) - cmp $10, %r8 - je L(StrncatExit10) - cmp $11, %r8 - je L(StrncatExit11) - cmp $12, %r8 - je L(StrncatExit12) - cmp $13, %r8 - je L(StrncatExit13) - cmp $14, %r8 - je L(StrncatExit14) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - xor %ah, %ah - movb %ah, 16(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit0): - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15Bytes): - cmp $9, %r8 - je L(StrncatExit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8Bytes): - cmpb $0, (%rcx) - jz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $3, %r8 - je 
L(StrncatExit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - -# endif -END (STRCAT) -#endif diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S deleted file mode 100644 index 6c45ff3ec7..0000000000 --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCAT -#define STRCAT __strncat_ssse3 -#include "strcat-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
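The bulk of the deleted strcat-ssse3.S above is an inlined strlen: a byte-at-a-time check of the first 16 bytes, then 16-byte-aligned pcmpeqb/pmovmskb probes until a null is found. Below is a minimal C-intrinsics sketch of that scan, assuming SSE2 intrinsics and GCC/Clang builtins; the function name and layout are illustrative only, not glibc code:

#include <emmintrin.h>	/* SSE2 intrinsics.  */
#include <stddef.h>
#include <stdint.h>

/* Hypothetical sketch of the null scan at the top of the removed
   strcat-ssse3.S; not the actual glibc implementation.  */
static size_t
sse2_strlen_sketch (const char *s)
{
  /* Bytes 0..15 are checked one at a time, matching the
     cmpb/L(exit_tailN) ladder in the deleted code.  */
  for (size_t i = 0; i < 16; i++)
    if (s[i] == '\0')
      return i;

  /* Round up to the next 16-byte boundary, as the deleted code does
     with "lea 16(%rdi), %rax; and $-16, %rax".  An aligned 16-byte
     load cannot fault: it stays within the page holding byte *p, and
     all bytes before p are known non-null, so *p is still part of
     the string.  */
  const char *p = (const char *) (((uintptr_t) s + 16) & ~(uintptr_t) 15);
  for (;;)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) p);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk,
						    _mm_setzero_si128 ()));
      if (mask != 0)
	/* Lowest set bit = offset of the first null in this chunk.  */
	return (size_t) (p - s) + (size_t) __builtin_ctz (mask);
      p += 16;
    }
}

Note that nothing in this scan actually needs SSSE3; the SSSE3 requirement comes from the palignr shifted-copy loops in strcpy-ssse3.S (removed in patch 6/6 below), which realign the source to the destination 16 bytes at a time. With SSE4.1/AVX2/EVEX variants covering modern CPUs, a separate SSSE3 path buys little over the plain SSE2 fallback, which is the rationale for these removals.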
* [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (3 preceding siblings ...) 2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-03-25 20:44 ` Noah Goldstein 4 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - 6 files changed, 3572 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a2ebc06c5f..292353bad7 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -42,13 +42,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -79,7 +77,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -106,7 +103,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 4133ed7e43..505b8002e0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb (%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) 
- pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, 
%xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - 
movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - 
pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave7) -# 
endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps 
%xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx 
-# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 -L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - 
.p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps 
%xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp 
$15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 - jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax 
- mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - 
cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz 
L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps 
%xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - 
lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave12): - movaps %xmm2, %xmm3 - add 
$48, %r8 - jle L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - palignr $12, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit12): - lea 4(%rdx, %rsi), %rdx - lea 4(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave13): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - palignr $13, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit13): - lea 3(%rdx, %rsi), %rdx - lea 3(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave14): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - palignr $14, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit14): - lea 2(%rdx, %rsi), %rdx - lea 2(%rcx, %rsi), %rcx - movw -2(%rcx), %ax - xor %rsi, %rsi - movw %ax, -2(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave15): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - palignr $15, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit15): - lea 1(%rdx, %rsi), %rdx - lea 1(%rcx, %rsi), %rcx - movb -1(%rcx), %ah - xor %rsi, %rsi - movb %ah, -1(%rdx) - jmp L(CopyFrom1To16BytesCase3) - -# endif -# ifndef USE_AS_STRCAT -END (STRCPY) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S deleted file mode 100644 index bf82ee447d..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_ssse3 -#include "strcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
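(A note on the three-line strncpy-ssse3.S removed at the end of the hunk above: that wrapper is the entire strncpy port. The multiarch files build each entry point from one assembly template by predefining macros such as STRCPY, USE_AS_STRNCPY, and USE_AS_STPCPY before including the shared body, which is why the deleted strcpy-ssse3.S is dense with #ifdef blocks. Below is a minimal C sketch of the same template trick; the file and macro names here (copy_template.h, COPY_NAME, USE_AS_BOUNDED) are illustrative only, not glibc's, and strncpy's zero padding of the tail is omitted for brevity.

    /* copy_template.h -- shared body.  The including file chooses the
       function name and whether a length bound applies.  Illustrative
       sketch, not the glibc implementation.  */
    #include <stddef.h>

    char *
    COPY_NAME (char *dst, const char *src
    #ifdef USE_AS_BOUNDED
               , size_t n
    #endif
               )
    {
      size_t i = 0;
    #ifdef USE_AS_BOUNDED
      while (i < n && src[i] != '\0')
    #else
      while (src[i] != '\0')
    #endif
        {
          dst[i] = src[i];
          i++;
        }
    #ifdef USE_AS_BOUNDED
      if (i < n)          /* Bounded variant may not NUL-terminate.  */
        dst[i] = '\0';
    #else
      dst[i] = '\0';
    #endif
      return dst;         /* strcpy-style: return the destination.  */
    }

    /* copyn.c -- the analogue of the removed strncpy-ssse3.S wrapper:
       three lines that specialize the shared template.  */
    #define USE_AS_BOUNDED
    #define COPY_NAME my_copyn
    #include "copy_template.h"

Every variant inherits fixes to the shared body for free; the flip side, visible in this series, is that retiring an ISA level means deleting one large template plus a handful of trivial wrappers.)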
* [PATCH v3 1/6] x86: Remove str{p}{n}cpy-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-03-25 19:55 ` H.J. Lu 2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-04-10 0:42 ` Noah Goldstein 2022-04-10 0:48 ` Noah Goldstein 2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein ` (6 subsequent siblings) 9 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- Full memcpy ssse3 results. Number are comparison of geometric mean of N=50 runs on Zhaoxin KX-6840@2000MHz bench-memcpy: length, align1, align2, dst > src, New Time / Old Time 1, 0, 0, 0, 2.099 1, 0, 0, 1, 2.099 1, 32, 0, 0, 2.103 1, 32, 0, 1, 2.103 1, 0, 32, 0, 2.099 1, 0, 32, 1, 2.098 1, 32, 32, 0, 2.098 1, 32, 32, 1, 2.098 1, 2048, 0, 0, 2.098 1, 2048, 0, 1, 2.098 2, 0, 0, 0, 1.135 2, 0, 0, 1, 1.136 2, 1, 0, 0, 1.139 2, 1, 0, 1, 1.139 2, 33, 0, 0, 1.165 2, 33, 0, 1, 1.139 2, 0, 1, 0, 1.136 2, 0, 1, 1, 1.136 2, 0, 33, 0, 1.136 2, 0, 33, 1, 1.136 2, 1, 1, 0, 1.136 2, 1, 1, 1, 1.136 2, 33, 33, 0, 1.136 2, 33, 33, 1, 1.136 2, 2048, 0, 0, 1.136 2, 2048, 0, 1, 1.136 2, 2049, 0, 0, 1.191 2, 2049, 0, 1, 1.139 2, 2048, 1, 0, 1.136 2, 2048, 1, 1, 1.136 2, 2049, 1, 0, 1.136 2, 2049, 1, 1, 1.136 4, 0, 0, 0, 1.074 4, 0, 0, 1, 0.962 4, 2, 0, 0, 0.973 4, 2, 0, 1, 0.989 4, 34, 0, 0, 0.991 4, 34, 0, 1, 0.991 4, 0, 2, 0, 0.962 4, 0, 2, 1, 0.962 4, 0, 34, 0, 0.962 4, 0, 34, 1, 0.962 4, 2, 2, 0, 0.962 4, 2, 2, 1, 0.962 4, 34, 34, 0, 0.962 4, 34, 34, 1, 0.962 4, 2048, 0, 0, 0.962 4, 2048, 0, 1, 0.962 4, 2050, 0, 0, 0.977 4, 2050, 0, 1, 0.979 4, 2048, 2, 0, 0.962 4, 2048, 2, 1, 0.962 4, 2050, 2, 0, 0.962 4, 2050, 2, 1, 0.962 8, 0, 0, 0, 0.961 8, 0, 0, 1, 0.962 8, 3, 0, 0, 1.0 8, 3, 0, 1, 1.0 8, 35, 0, 0, 1.0 8, 35, 0, 1, 1.0 8, 0, 3, 0, 0.962 8, 0, 3, 1, 0.962 8, 0, 35, 0, 0.962 8, 0, 35, 1, 0.962 8, 3, 3, 0, 0.962 8, 3, 3, 1, 0.962 8, 35, 35, 0, 0.962 8, 35, 35, 1, 0.962 8, 2048, 0, 0, 0.962 8, 2048, 0, 1, 0.962 8, 2051, 0, 0, 1.0 8, 2051, 0, 1, 1.0 8, 2048, 3, 0, 0.962 8, 2048, 3, 1, 0.962 8, 2051, 3, 0, 0.962 8, 2051, 3, 1, 0.962 16, 0, 0, 0, 0.798 16, 0, 0, 1, 0.799 16, 4, 0, 0, 0.8 16, 4, 0, 1, 0.801 16, 36, 0, 0, 0.801 16, 36, 0, 1, 0.8 16, 0, 4, 0, 0.798 16, 0, 4, 1, 0.798 16, 0, 36, 0, 0.798 16, 0, 36, 1, 0.798 16, 4, 4, 0, 0.798 16, 4, 4, 1, 0.798 16, 36, 36, 0, 0.798 16, 36, 36, 1, 0.798 16, 2048, 0, 0, 0.798 16, 2048, 0, 1, 0.799 16, 2052, 0, 0, 0.8 16, 2052, 0, 1, 0.8 16, 2048, 4, 0, 0.798 16, 2048, 4, 1, 0.798 16, 2052, 4, 0, 0.798 16, 2052, 4, 1, 0.798 32, 0, 0, 0, 0.471 32, 0, 0, 1, 0.471 32, 5, 0, 0, 0.471 32, 5, 0, 1, 0.471 32, 37, 0, 0, 0.961 32, 37, 0, 1, 0.961 32, 0, 5, 0, 0.471 32, 0, 5, 1, 0.471 32, 0, 37, 0, 1.021 32, 0, 37, 1, 1.021 32, 5, 5, 0, 0.471 32, 5, 5, 1, 0.471 32, 37, 37, 0, 1.011 32, 37, 37, 1, 1.011 32, 2048, 0, 0, 0.471 32, 2048, 0, 1, 0.471 32, 2053, 0, 0, 0.471 32, 2053, 0, 1, 0.471 32, 2048, 5, 0, 0.471 32, 2048, 5, 1, 0.471 32, 2053, 5, 0, 0.471 32, 2053, 5, 1, 0.471 64, 0, 0, 0, 1.0 64, 0, 0, 1, 1.0 64, 6, 0, 0, 0.862 64, 6, 0, 1, 0.862 64, 38, 0, 0, 0.912 64, 38, 0, 1, 0.912 64, 0, 6, 0, 0.896 64, 0, 6, 1, 0.896 64, 0, 38, 0, 0.906 64, 0, 38, 1, 0.906 64, 6, 6, 0, 0.91 64, 6, 6, 1, 0.91 64, 38, 38, 0, 0.883 64, 38, 38, 1, 0.883 64, 2048, 0, 0, 1.0 64, 2048, 0, 1, 1.0 64, 2054, 
0, 0, 0.862 64, 2054, 0, 1, 0.862 64, 2048, 6, 0, 0.887 64, 2048, 6, 1, 0.887 64, 2054, 6, 0, 0.887 64, 2054, 6, 1, 0.887 128, 0, 0, 0, 0.857 128, 0, 0, 1, 0.857 128, 7, 0, 0, 0.875 128, 7, 0, 1, 0.875 128, 39, 0, 0, 0.892 128, 39, 0, 1, 0.892 128, 0, 7, 0, 1.183 128, 0, 7, 1, 1.183 128, 0, 39, 0, 1.113 128, 0, 39, 1, 1.113 128, 7, 7, 0, 0.692 128, 7, 7, 1, 0.692 128, 39, 39, 0, 1.104 128, 39, 39, 1, 1.104 128, 2048, 0, 0, 0.857 128, 2048, 0, 1, 0.857 128, 2055, 0, 0, 0.875 128, 2055, 0, 1, 0.875 128, 2048, 7, 0, 0.959 128, 2048, 7, 1, 0.959 128, 2055, 7, 0, 1.036 128, 2055, 7, 1, 1.036 256, 0, 0, 0, 0.889 256, 0, 0, 1, 0.889 256, 8, 0, 0, 0.966 256, 8, 0, 1, 0.966 256, 40, 0, 0, 0.983 256, 40, 0, 1, 0.983 256, 0, 8, 0, 1.29 256, 0, 8, 1, 1.29 256, 0, 40, 0, 1.274 256, 0, 40, 1, 1.274 256, 8, 8, 0, 0.865 256, 8, 8, 1, 0.865 256, 40, 40, 0, 1.477 256, 40, 40, 1, 1.477 256, 2048, 0, 0, 0.889 256, 2048, 0, 1, 0.889 256, 2056, 0, 0, 0.966 256, 2056, 0, 1, 0.966 256, 2048, 8, 0, 0.952 256, 2048, 8, 1, 0.952 256, 2056, 8, 0, 0.878 256, 2056, 8, 1, 0.878 512, 0, 0, 0, 1.077 512, 0, 0, 1, 1.077 512, 9, 0, 0, 1.001 512, 9, 0, 1, 1.0 512, 41, 0, 0, 0.954 512, 41, 0, 1, 0.954 512, 0, 9, 0, 1.191 512, 0, 9, 1, 1.191 512, 0, 41, 0, 1.181 512, 0, 41, 1, 1.181 512, 9, 9, 0, 0.765 512, 9, 9, 1, 0.765 512, 41, 41, 0, 0.905 512, 41, 41, 1, 0.905 512, 2048, 0, 0, 1.077 512, 2048, 0, 1, 1.077 512, 2057, 0, 0, 1.0 512, 2057, 0, 1, 1.0 512, 2048, 9, 0, 1.0 512, 2048, 9, 1, 1.0 512, 2057, 9, 0, 0.733 512, 2057, 9, 1, 0.733 1024, 0, 0, 0, 1.143 1024, 0, 0, 1, 1.143 1024, 10, 0, 0, 1.015 1024, 10, 0, 1, 1.015 1024, 42, 0, 0, 1.045 1024, 42, 0, 1, 1.045 1024, 0, 10, 0, 1.126 1024, 0, 10, 1, 1.126 1024, 0, 42, 0, 1.114 1024, 0, 42, 1, 1.114 1024, 10, 10, 0, 0.89 1024, 10, 10, 1, 0.89 1024, 42, 42, 0, 0.986 1024, 42, 42, 1, 0.986 1024, 2048, 0, 0, 1.143 1024, 2048, 0, 1, 1.143 1024, 2058, 0, 0, 1.015 1024, 2058, 0, 1, 1.015 1024, 2048, 10, 0, 1.03 1024, 2048, 10, 1, 1.03 1024, 2058, 10, 0, 0.854 1024, 2058, 10, 1, 0.854 2048, 0, 0, 0, 1.005 2048, 0, 0, 1, 1.005 2048, 11, 0, 0, 1.013 2048, 11, 0, 1, 1.014 2048, 43, 0, 0, 1.044 2048, 43, 0, 1, 1.044 2048, 0, 11, 0, 1.003 2048, 0, 11, 1, 1.003 2048, 0, 43, 0, 1.003 2048, 0, 43, 1, 1.003 2048, 11, 11, 0, 0.92 2048, 11, 11, 1, 0.92 2048, 43, 43, 0, 1.0 2048, 43, 43, 1, 1.0 2048, 2048, 0, 0, 1.005 2048, 2048, 0, 1, 1.005 2048, 2059, 0, 0, 0.904 2048, 2059, 0, 1, 0.904 2048, 2048, 11, 0, 1.0 2048, 2048, 11, 1, 1.0 2048, 2059, 11, 0, 0.979 2048, 2059, 11, 1, 0.979 4096, 0, 0, 0, 1.014 4096, 0, 0, 1, 1.014 4096, 12, 0, 0, 0.855 4096, 12, 0, 1, 0.855 4096, 44, 0, 0, 0.857 4096, 44, 0, 1, 0.857 4096, 0, 12, 0, 0.932 4096, 0, 12, 1, 0.932 4096, 0, 44, 0, 0.932 4096, 0, 44, 1, 0.932 4096, 12, 12, 0, 0.999 4096, 12, 12, 1, 0.999 4096, 44, 44, 0, 1.051 4096, 44, 44, 1, 1.051 4096, 2048, 0, 0, 1.014 4096, 2048, 0, 1, 1.014 4096, 2060, 0, 0, 0.98 4096, 2060, 0, 1, 0.98 4096, 2048, 12, 0, 0.77 4096, 2048, 12, 1, 0.77 4096, 2060, 12, 0, 0.943 4096, 2060, 12, 1, 0.943 8192, 0, 0, 0, 1.046 8192, 0, 0, 1, 1.046 8192, 13, 0, 0, 0.885 8192, 13, 0, 1, 0.885 8192, 45, 0, 0, 0.887 8192, 45, 0, 1, 0.886 8192, 0, 13, 0, 0.942 8192, 0, 13, 1, 0.942 8192, 0, 45, 0, 0.942 8192, 0, 45, 1, 0.942 8192, 13, 13, 0, 1.03 8192, 13, 13, 1, 1.03 8192, 45, 45, 0, 1.048 8192, 45, 45, 1, 1.048 8192, 2048, 0, 0, 1.048 8192, 2048, 0, 1, 1.048 8192, 2061, 0, 0, 1.011 8192, 2061, 0, 1, 1.011 8192, 2048, 13, 0, 0.789 8192, 2048, 13, 1, 0.789 8192, 2061, 13, 0, 0.991 8192, 2061, 13, 1, 0.991 16384, 0, 0, 0, 1.014 
16384, 0, 0, 1, 1.008 16384, 14, 0, 0, 0.951 16384, 14, 0, 1, 0.95 16384, 46, 0, 0, 0.874 16384, 46, 0, 1, 0.871 16384, 0, 14, 0, 0.813 16384, 0, 14, 1, 0.81 16384, 0, 46, 0, 0.85 16384, 0, 46, 1, 0.86 16384, 14, 14, 0, 0.985 16384, 14, 14, 1, 0.975 16384, 46, 46, 0, 1.025 16384, 46, 46, 1, 1.027 16384, 2048, 0, 0, 1.058 16384, 2048, 0, 1, 1.058 16384, 2062, 0, 0, 0.849 16384, 2062, 0, 1, 0.848 16384, 2048, 14, 0, 0.907 16384, 2048, 14, 1, 0.907 16384, 2062, 14, 0, 0.988 16384, 2062, 14, 1, 0.995 32768, 0, 0, 0, 0.979 32768, 0, 0, 1, 0.979 32768, 15, 0, 0, 1.006 32768, 15, 0, 1, 1.006 32768, 47, 0, 0, 1.004 32768, 47, 0, 1, 1.004 32768, 0, 15, 0, 1.045 32768, 0, 15, 1, 1.045 32768, 0, 47, 0, 1.011 32768, 0, 47, 1, 1.012 32768, 15, 15, 0, 0.977 32768, 15, 15, 1, 0.977 32768, 47, 47, 0, 0.96 32768, 47, 47, 1, 0.96 32768, 2048, 0, 0, 0.978 32768, 2048, 0, 1, 0.978 32768, 2063, 0, 0, 1.004 32768, 2063, 0, 1, 1.004 32768, 2048, 15, 0, 1.036 32768, 2048, 15, 1, 1.036 32768, 2063, 15, 0, 0.978 32768, 2063, 15, 1, 0.978 65536, 0, 0, 0, 0.981 65536, 0, 0, 1, 0.981 65536, 16, 0, 0, 0.987 65536, 16, 0, 1, 0.987 65536, 48, 0, 0, 0.968 65536, 48, 0, 1, 0.968 65536, 0, 16, 0, 1.014 65536, 0, 16, 1, 1.014 65536, 0, 48, 0, 0.984 65536, 0, 48, 1, 0.984 65536, 16, 16, 0, 1.01 65536, 16, 16, 1, 1.01 65536, 48, 48, 0, 0.968 65536, 48, 48, 1, 0.968 65536, 2048, 0, 0, 0.982 65536, 2048, 0, 1, 0.982 65536, 2064, 0, 0, 0.987 65536, 2064, 0, 1, 0.987 65536, 2048, 16, 0, 1.012 65536, 2048, 16, 1, 1.012 65536, 2064, 16, 0, 1.007 65536, 2064, 16, 1, 1.007 0, 0, 0, 0, 2.104 0, 2048, 0, 0, 2.104 0, 4095, 0, 0, 2.109 0, 0, 4095, 0, 2.103 1, 1, 0, 0, 2.104 1, 0, 1, 0, 2.098 1, 1, 1, 0, 2.098 1, 2049, 0, 0, 2.102 1, 2048, 1, 0, 2.098 1, 2049, 1, 0, 2.098 1, 4095, 0, 0, 2.103 1, 0, 4095, 0, 2.098 2, 2, 0, 0, 1.139 2, 0, 2, 0, 1.136 2, 2, 2, 0, 1.136 2, 2050, 0, 0, 1.139 2, 2048, 2, 0, 1.136 2, 2050, 2, 0, 1.136 2, 4095, 0, 0, 1.0 2, 0, 4095, 0, 1.022 3, 0, 0, 0, 0.981 3, 3, 0, 0, 0.984 3, 0, 3, 0, 0.982 3, 3, 3, 0, 0.982 3, 2048, 0, 0, 0.982 3, 2051, 0, 0, 0.983 3, 2048, 3, 0, 0.982 3, 2051, 3, 0, 0.982 3, 4095, 0, 0, 0.285 3, 0, 4095, 0, 0.231 4, 4, 0, 0, 1.373 4, 0, 4, 0, 1.31 4, 4, 4, 0, 1.282 4, 2052, 0, 0, 1.264 4, 2048, 4, 0, 1.254 4, 2052, 4, 0, 1.254 4, 4095, 0, 0, 1.971 4, 0, 4095, 0, 1.994 5, 0, 0, 0, 1.145 5, 5, 0, 0, 1.155 5, 0, 5, 0, 1.171 5, 5, 5, 0, 1.171 5, 2048, 0, 0, 1.197 5, 2053, 0, 0, 1.173 5, 2048, 5, 0, 1.171 5, 2053, 5, 0, 1.171 5, 4095, 0, 0, 0.935 5, 0, 4095, 0, 1.017 6, 0, 0, 0, 1.145 6, 6, 0, 0, 1.098 6, 0, 6, 0, 1.096 6, 6, 6, 0, 1.096 6, 2048, 0, 0, 1.12 6, 2054, 0, 0, 1.122 6, 2048, 6, 0, 1.12 6, 2054, 6, 0, 1.096 6, 4095, 0, 0, 0.935 6, 0, 4095, 0, 1.018 7, 0, 0, 0, 1.071 7, 7, 0, 0, 1.074 7, 0, 7, 0, 1.072 7, 7, 7, 0, 1.072 7, 2048, 0, 0, 1.096 7, 2055, 0, 0, 1.098 7, 2048, 7, 0, 1.096 7, 2055, 7, 0, 1.096 7, 4095, 0, 0, 0.935 7, 0, 4095, 0, 1.016 8, 8, 0, 0, 1.167 8, 0, 8, 0, 1.028 8, 8, 8, 0, 1.028 8, 2056, 0, 0, 1.069 8, 2048, 8, 0, 1.028 8, 2056, 8, 0, 1.028 8, 4095, 0, 0, 1.029 8, 0, 4095, 0, 1.043 9, 0, 0, 0, 0.799 9, 9, 0, 0, 0.801 9, 0, 9, 0, 0.799 9, 9, 9, 0, 0.799 9, 2048, 0, 0, 0.8 9, 2057, 0, 0, 0.801 9, 2048, 9, 0, 0.8 9, 2057, 9, 0, 0.799 9, 4095, 0, 0, 0.909 9, 0, 4095, 0, 1.0 10, 0, 0, 0, 0.799 10, 10, 0, 0, 0.801 10, 0, 10, 0, 0.8 10, 10, 10, 0, 0.8 10, 2048, 0, 0, 0.8 10, 2058, 0, 0, 0.801 10, 2048, 10, 0, 0.8 10, 2058, 10, 0, 0.8 10, 4095, 0, 0, 0.909 10, 0, 4095, 0, 1.0 11, 0, 0, 0, 0.799 11, 11, 0, 0, 0.801 11, 0, 11, 0, 0.8 11, 11, 11, 0, 0.8 11, 2048, 0, 0, 0.8 11, 
2059, 0, 0, 0.802 11, 2048, 11, 0, 0.8 11, 2059, 11, 0, 0.8 11, 4095, 0, 0, 0.909 11, 0, 4095, 0, 1.0 12, 0, 0, 0, 0.799 12, 12, 0, 0, 0.801 12, 0, 12, 0, 0.8 12, 12, 12, 0, 0.8 12, 2048, 0, 0, 0.8 12, 2060, 0, 0, 0.802 12, 2048, 12, 0, 0.8 12, 2060, 12, 0, 0.8 12, 4095, 0, 0, 0.909 12, 0, 4095, 0, 1.0 13, 0, 0, 0, 0.798 13, 13, 0, 0, 0.801 13, 0, 13, 0, 0.799 13, 13, 13, 0, 0.799 13, 2048, 0, 0, 0.8 13, 2061, 0, 0, 0.801 13, 2048, 13, 0, 0.8 13, 2061, 13, 0, 0.8 13, 4095, 0, 0, 0.909 13, 0, 4095, 0, 1.0 14, 0, 0, 0, 0.799 14, 14, 0, 0, 0.801 14, 0, 14, 0, 0.8 14, 14, 14, 0, 0.8 14, 2048, 0, 0, 0.8 14, 2062, 0, 0, 0.801 14, 2048, 14, 0, 0.8 14, 2062, 14, 0, 0.8 14, 4095, 0, 0, 0.909 14, 0, 4095, 0, 1.0 15, 0, 0, 0, 0.799 15, 15, 0, 0, 0.801 15, 0, 15, 0, 0.8 15, 15, 15, 0, 0.8 15, 2048, 0, 0, 0.8 15, 2063, 0, 0, 0.802 15, 2048, 15, 0, 0.8 15, 2063, 15, 0, 0.8 15, 4095, 0, 0, 0.909 15, 0, 4095, 0, 1.0 16, 16, 0, 0, 0.801 16, 0, 16, 0, 0.799 16, 16, 16, 0, 0.799 16, 2064, 0, 0, 0.801 16, 2048, 16, 0, 0.798 16, 2064, 16, 0, 0.798 16, 4095, 0, 0, 1.818 16, 0, 4095, 0, 1.957 17, 0, 0, 0, 0.798 17, 17, 0, 0, 0.8 17, 0, 17, 0, 0.799 17, 17, 17, 0, 0.798 17, 2048, 0, 0, 0.798 17, 2065, 0, 0, 0.8 17, 2048, 17, 0, 0.798 17, 2065, 17, 0, 0.799 17, 4095, 0, 0, 0.937 17, 0, 4095, 0, 1.021 18, 0, 0, 0, 0.798 18, 18, 0, 0, 0.801 18, 0, 18, 0, 0.798 18, 18, 18, 0, 0.798 18, 2048, 0, 0, 0.799 18, 2066, 0, 0, 0.8 18, 2048, 18, 0, 0.798 18, 2066, 18, 0, 0.798 18, 4095, 0, 0, 0.937 18, 0, 4095, 0, 1.021 19, 0, 0, 0, 0.798 19, 19, 0, 0, 0.8 19, 0, 19, 0, 0.798 19, 19, 19, 0, 0.798 19, 2048, 0, 0, 0.798 19, 2067, 0, 0, 0.8 19, 2048, 19, 0, 0.798 19, 2067, 19, 0, 0.798 19, 4095, 0, 0, 0.937 19, 0, 4095, 0, 1.021 20, 0, 0, 0, 0.798 20, 20, 0, 0, 0.8 20, 0, 20, 0, 0.798 20, 20, 20, 0, 0.798 20, 2048, 0, 0, 0.798 20, 2068, 0, 0, 0.8 20, 2048, 20, 0, 0.798 20, 2068, 20, 0, 0.798 20, 4095, 0, 0, 0.937 20, 0, 4095, 0, 1.021 21, 0, 0, 0, 0.798 21, 21, 0, 0, 0.801 21, 0, 21, 0, 0.798 21, 21, 21, 0, 0.798 21, 2048, 0, 0, 0.798 21, 2069, 0, 0, 0.801 21, 2048, 21, 0, 0.799 21, 2069, 21, 0, 0.798 21, 4095, 0, 0, 0.937 21, 0, 4095, 0, 1.021 22, 0, 0, 0, 0.798 22, 22, 0, 0, 0.801 22, 0, 22, 0, 0.798 22, 22, 22, 0, 0.798 22, 2048, 0, 0, 0.798 22, 2070, 0, 0, 0.801 22, 2048, 22, 0, 0.798 22, 2070, 22, 0, 0.798 22, 4095, 0, 0, 0.937 22, 0, 4095, 0, 1.021 23, 0, 0, 0, 0.798 23, 23, 0, 0, 0.8 23, 0, 23, 0, 0.798 23, 23, 23, 0, 0.798 23, 2048, 0, 0, 0.798 23, 2071, 0, 0, 0.8 23, 2048, 23, 0, 0.798 23, 2071, 23, 0, 0.798 23, 4095, 0, 0, 0.937 23, 0, 4095, 0, 1.021 24, 0, 0, 0, 0.798 24, 24, 0, 0, 0.8 24, 0, 24, 0, 0.799 24, 24, 24, 0, 0.798 24, 2048, 0, 0, 0.798 24, 2072, 0, 0, 0.801 24, 2048, 24, 0, 0.798 24, 2072, 24, 0, 0.798 24, 4095, 0, 0, 0.937 24, 0, 4095, 0, 1.021 25, 0, 0, 0, 0.5 25, 25, 0, 0, 0.5 25, 0, 25, 0, 0.5 25, 25, 25, 0, 0.5 25, 2048, 0, 0, 0.5 25, 2073, 0, 0, 0.501 25, 2048, 25, 0, 0.5 25, 2073, 25, 0, 0.5 25, 4095, 0, 0, 0.974 25, 0, 4095, 0, 0.98 26, 0, 0, 0, 0.5 26, 26, 0, 0, 0.501 26, 0, 26, 0, 0.5 26, 26, 26, 0, 0.501 26, 2048, 0, 0, 0.5 26, 2074, 0, 0, 0.5 26, 2048, 26, 0, 0.5 26, 2074, 26, 0, 0.5 26, 4095, 0, 0, 0.974 26, 0, 4095, 0, 1.0 27, 0, 0, 0, 0.5 27, 27, 0, 0, 0.501 27, 0, 27, 0, 0.5 27, 27, 27, 0, 0.5 27, 2048, 0, 0, 0.5 27, 2075, 0, 0, 0.5 27, 2048, 27, 0, 0.5 27, 2075, 27, 0, 0.5 27, 4095, 0, 0, 0.974 27, 0, 4095, 0, 1.0 28, 0, 0, 0, 0.5 28, 28, 0, 0, 0.501 28, 0, 28, 0, 0.5 28, 28, 28, 0, 0.5 28, 2048, 0, 0, 0.5 28, 2076, 0, 0, 0.5 28, 2048, 28, 0, 0.5 28, 2076, 28, 0, 0.5 28, 4095, 0, 0, 0.974 
28, 0, 4095, 0, 1.0 29, 0, 0, 0, 0.471 29, 29, 0, 0, 0.471 29, 0, 29, 0, 0.471 29, 29, 29, 0, 0.471 29, 2048, 0, 0, 0.471 29, 2077, 0, 0, 0.471 29, 2048, 29, 0, 0.471 29, 2077, 29, 0, 0.471 29, 4095, 0, 0, 0.974 29, 0, 4095, 0, 1.0 30, 0, 0, 0, 0.471 30, 30, 0, 0, 0.471 30, 0, 30, 0, 0.471 30, 30, 30, 0, 0.471 30, 2048, 0, 0, 0.471 30, 2078, 0, 0, 0.471 30, 2048, 30, 0, 0.471 30, 2078, 30, 0, 0.471 30, 4095, 0, 0, 0.974 30, 0, 4095, 0, 1.0 31, 0, 0, 0, 0.471 31, 31, 0, 0, 0.471 31, 0, 31, 0, 0.471 31, 31, 31, 0, 0.471 31, 2048, 0, 0, 0.471 31, 2079, 0, 0, 0.471 31, 2048, 31, 0, 0.471 31, 2079, 31, 0, 0.471 31, 4095, 0, 0, 0.974 31, 0, 4095, 0, 1.0 48, 0, 0, 0, 1.0 48, 0, 0, 1, 1.0 48, 3, 0, 0, 1.0 48, 3, 0, 1, 1.0 48, 0, 3, 0, 1.0 48, 0, 3, 1, 1.0 48, 3, 3, 0, 1.0 48, 3, 3, 1, 1.0 48, 2048, 0, 0, 1.0 48, 2048, 0, 1, 1.0 48, 2051, 0, 0, 1.0 48, 2051, 0, 1, 1.0 48, 2048, 3, 0, 1.0 48, 2048, 3, 1, 1.0 48, 2051, 3, 0, 1.0 48, 2051, 3, 1, 1.0 80, 0, 0, 0, 0.781 80, 0, 0, 1, 0.782 80, 5, 0, 0, 0.976 80, 5, 0, 1, 0.976 80, 0, 5, 0, 1.232 80, 0, 5, 1, 1.232 80, 5, 5, 0, 1.542 80, 5, 5, 1, 1.543 80, 2048, 0, 0, 0.781 80, 2048, 0, 1, 0.782 80, 2053, 0, 0, 0.976 80, 2053, 0, 1, 0.976 80, 2048, 5, 0, 1.093 80, 2048, 5, 1, 1.093 80, 2053, 5, 0, 1.371 80, 2053, 5, 1, 1.371 96, 0, 0, 0, 0.758 96, 0, 0, 1, 0.758 96, 6, 0, 0, 0.929 96, 6, 0, 1, 0.929 96, 0, 6, 0, 1.204 96, 0, 6, 1, 1.204 96, 6, 6, 0, 1.562 96, 6, 6, 1, 1.562 96, 2048, 0, 0, 0.758 96, 2048, 0, 1, 0.758 96, 2054, 0, 0, 0.929 96, 2054, 0, 1, 0.929 96, 2048, 6, 0, 1.068 96, 2048, 6, 1, 1.068 96, 2054, 6, 0, 1.562 96, 2054, 6, 1, 1.562 112, 0, 0, 0, 0.736 112, 0, 0, 1, 0.736 112, 7, 0, 0, 0.675 112, 7, 0, 1, 0.675 112, 0, 7, 0, 0.778 112, 0, 7, 1, 0.778 112, 7, 7, 0, 0.909 112, 7, 7, 1, 0.909 112, 2048, 0, 0, 0.736 112, 2048, 0, 1, 0.736 112, 2055, 0, 0, 0.675 112, 2055, 0, 1, 0.675 112, 2048, 7, 0, 0.778 112, 2048, 7, 1, 0.778 112, 2055, 7, 0, 0.909 112, 2055, 7, 1, 0.909 144, 0, 0, 0, 0.857 144, 0, 0, 1, 0.857 144, 9, 0, 0, 0.941 144, 9, 0, 1, 0.943 144, 0, 9, 0, 1.137 144, 0, 9, 1, 1.137 144, 9, 9, 0, 1.514 144, 9, 9, 1, 1.514 144, 2048, 0, 0, 0.857 144, 2048, 0, 1, 0.857 144, 2057, 0, 0, 0.939 144, 2057, 0, 1, 0.945 144, 2048, 9, 0, 0.922 144, 2048, 9, 1, 0.922 144, 2057, 9, 0, 1.514 144, 2057, 9, 1, 1.514 160, 0, 0, 0, 0.698 160, 0, 0, 1, 0.698 160, 10, 0, 0, 0.91 160, 10, 0, 1, 0.91 160, 0, 10, 0, 1.211 160, 0, 10, 1, 1.212 160, 10, 10, 0, 1.357 160, 10, 10, 1, 1.357 160, 2048, 0, 0, 0.698 160, 2048, 0, 1, 0.698 160, 2058, 0, 0, 0.91 160, 2058, 0, 1, 0.91 160, 2048, 10, 0, 0.923 160, 2048, 10, 1, 0.923 160, 2058, 10, 0, 1.357 160, 2058, 10, 1, 1.357 176, 0, 0, 0, 0.796 176, 0, 0, 1, 0.796 176, 11, 0, 0, 0.804 176, 11, 0, 1, 0.804 176, 0, 11, 0, 0.774 176, 0, 11, 1, 0.774 176, 11, 11, 0, 0.814 176, 11, 11, 1, 0.814 176, 2048, 0, 0, 0.796 176, 2048, 0, 1, 0.796 176, 2059, 0, 0, 0.804 176, 2059, 0, 1, 0.804 176, 2048, 11, 0, 0.774 176, 2048, 11, 1, 0.774 176, 2059, 11, 0, 0.814 176, 2059, 11, 1, 0.814 192, 0, 0, 0, 0.778 192, 0, 0, 1, 0.778 192, 12, 0, 0, 0.881 192, 12, 0, 1, 0.881 192, 0, 12, 0, 1.167 192, 0, 12, 1, 1.167 192, 12, 12, 0, 0.841 192, 12, 12, 1, 0.841 192, 2048, 0, 0, 0.778 192, 2048, 0, 1, 0.778 192, 2060, 0, 0, 0.881 192, 2060, 0, 1, 0.881 192, 2048, 12, 0, 0.889 192, 2048, 12, 1, 0.889 192, 2060, 12, 0, 0.906 192, 2060, 12, 1, 0.906 208, 0, 0, 0, 0.833 208, 0, 0, 1, 0.833 208, 13, 0, 0, 0.921 208, 13, 0, 1, 0.921 208, 0, 13, 0, 0.835 208, 0, 13, 1, 0.833 208, 13, 13, 0, 1.333 208, 13, 13, 1, 1.333 208, 2048, 0, 0, 0.833 
208, 2048, 0, 1, 0.833 208, 2061, 0, 0, 0.921 208, 2061, 0, 1, 0.921 208, 2048, 13, 0, 0.833 208, 2048, 13, 1, 0.833 208, 2061, 13, 0, 1.333 208, 2061, 13, 1, 1.333 224, 0, 0, 0, 0.93 224, 0, 0, 1, 0.93 224, 14, 0, 0, 1.0 224, 14, 0, 1, 1.0 224, 0, 14, 0, 1.15 224, 0, 14, 1, 1.15 224, 14, 14, 0, 1.452 224, 14, 14, 1, 1.452 224, 2048, 0, 0, 0.93 224, 2048, 0, 1, 0.93 224, 2062, 0, 0, 1.0 224, 2062, 0, 1, 1.0 224, 2048, 14, 0, 0.833 224, 2048, 14, 1, 0.833 224, 2062, 14, 0, 1.452 224, 2062, 14, 1, 1.452 240, 0, 0, 0, 0.909 240, 0, 0, 1, 0.909 240, 15, 0, 0, 0.797 240, 15, 0, 1, 0.797 240, 0, 15, 0, 0.771 240, 0, 15, 1, 0.771 240, 15, 15, 0, 0.93 240, 15, 15, 1, 0.93 240, 2048, 0, 0, 0.909 240, 2048, 0, 1, 0.909 240, 2063, 0, 0, 0.797 240, 2063, 0, 1, 0.797 240, 2048, 15, 0, 0.771 240, 2048, 15, 1, 0.771 240, 2063, 15, 0, 0.93 240, 2063, 15, 1, 0.93 272, 0, 0, 0, 0.9 272, 0, 0, 1, 0.9 272, 17, 0, 0, 1.015 272, 17, 0, 1, 1.015 272, 0, 17, 0, 0.926 272, 0, 17, 1, 0.927 272, 17, 17, 0, 0.892 272, 17, 17, 1, 0.892 272, 2048, 0, 0, 0.9 272, 2048, 0, 1, 0.9 272, 2065, 0, 0, 1.015 272, 2065, 0, 1, 1.015 272, 2048, 17, 0, 0.927 272, 2048, 17, 1, 0.927 272, 2065, 17, 0, 0.878 272, 2065, 17, 1, 0.878 288, 0, 0, 0, 0.882 288, 0, 0, 1, 0.882 288, 18, 0, 0, 0.803 288, 18, 0, 1, 0.803 288, 0, 18, 0, 0.768 288, 0, 18, 1, 0.768 288, 18, 18, 0, 0.882 288, 18, 18, 1, 0.882 288, 2048, 0, 0, 0.882 288, 2048, 0, 1, 0.882 288, 2066, 0, 0, 0.803 288, 2066, 0, 1, 0.803 288, 2048, 18, 0, 0.768 288, 2048, 18, 1, 0.768 288, 2066, 18, 0, 0.882 288, 2066, 18, 1, 0.882 304, 0, 0, 0, 0.865 304, 0, 0, 1, 0.865 304, 19, 0, 0, 0.944 304, 19, 0, 1, 0.944 304, 0, 19, 0, 0.943 304, 0, 19, 1, 0.943 304, 19, 19, 0, 0.956 304, 19, 19, 1, 0.956 304, 2048, 0, 0, 0.866 304, 2048, 0, 1, 0.865 304, 2067, 0, 0, 0.944 304, 2067, 0, 1, 0.944 304, 2048, 19, 0, 0.943 304, 2048, 19, 1, 0.943 304, 2067, 19, 0, 0.947 304, 2067, 19, 1, 0.947 320, 0, 0, 0, 0.944 320, 0, 0, 1, 0.944 320, 20, 0, 0, 0.962 320, 20, 0, 1, 0.962 320, 0, 20, 0, 1.214 320, 0, 20, 1, 1.214 320, 20, 20, 0, 1.365 320, 20, 20, 1, 1.365 320, 2048, 0, 0, 0.943 320, 2048, 0, 1, 0.943 320, 2068, 0, 0, 0.962 320, 2068, 0, 1, 0.962 320, 2048, 20, 0, 0.914 320, 2048, 20, 1, 0.914 320, 2068, 20, 0, 1.365 320, 2068, 20, 1, 1.365 336, 0, 0, 0, 1.0 336, 0, 0, 1, 1.0 336, 21, 0, 0, 0.986 336, 21, 0, 1, 0.986 336, 0, 21, 0, 0.853 336, 0, 21, 1, 0.853 336, 21, 21, 0, 0.843 336, 21, 21, 1, 0.843 336, 2048, 0, 0, 1.0 336, 2048, 0, 1, 1.0 336, 2069, 0, 0, 0.986 336, 2069, 0, 1, 0.986 336, 2048, 21, 0, 0.853 336, 2048, 21, 1, 0.853 336, 2069, 21, 0, 0.831 336, 2069, 21, 1, 0.831 352, 0, 0, 0, 0.98 352, 0, 0, 1, 0.98 352, 22, 0, 0, 0.811 352, 22, 0, 1, 0.811 352, 0, 22, 0, 0.882 352, 0, 22, 1, 0.882 352, 22, 22, 0, 1.1 352, 22, 22, 1, 1.1 352, 2048, 0, 0, 0.98 352, 2048, 0, 1, 0.98 352, 2070, 0, 0, 0.811 352, 2070, 0, 1, 0.811 352, 2048, 22, 0, 0.882 352, 2048, 22, 1, 0.882 352, 2070, 22, 0, 1.1 352, 2070, 22, 1, 1.1 368, 0, 0, 0, 1.058 368, 0, 0, 1, 1.058 368, 23, 0, 0, 1.0 368, 23, 0, 1, 1.0 368, 0, 23, 0, 0.948 368, 0, 23, 1, 0.948 368, 23, 23, 0, 0.723 368, 23, 23, 1, 0.723 368, 2048, 0, 0, 1.058 368, 2048, 0, 1, 1.058 368, 2071, 0, 0, 1.0 368, 2071, 0, 1, 1.0 368, 2048, 23, 0, 0.948 368, 2048, 23, 1, 0.948 368, 2071, 23, 0, 0.701 368, 2071, 23, 1, 0.701 384, 0, 0, 0, 1.012 384, 0, 0, 1, 1.012 384, 24, 0, 0, 1.04 384, 24, 0, 1, 1.04 384, 0, 24, 0, 1.154 384, 0, 24, 1, 1.154 384, 24, 24, 0, 1.423 384, 24, 24, 1, 1.423 384, 2048, 0, 0, 1.012 384, 2048, 0, 1, 1.012 384, 2072, 0, 0, 1.04 384, 
2072, 0, 1, 1.04 384, 2048, 24, 0, 0.91 384, 2048, 24, 1, 0.91 384, 2072, 24, 0, 1.423 384, 2072, 24, 1, 1.423 400, 0, 0, 0, 0.948 400, 0, 0, 1, 0.948 400, 25, 0, 0, 0.957 400, 25, 0, 1, 0.957 400, 0, 25, 0, 1.099 400, 0, 25, 1, 1.069 400, 25, 25, 0, 0.885 400, 25, 25, 1, 0.885 400, 2048, 0, 0, 0.948 400, 2048, 0, 1, 0.948 400, 2073, 0, 0, 0.957 400, 2073, 0, 1, 0.957 400, 2048, 25, 0, 0.94 400, 2048, 25, 1, 0.94 400, 2073, 25, 0, 0.908 400, 2073, 25, 1, 0.908 416, 0, 0, 0, 1.017 416, 0, 0, 1, 1.017 416, 26, 0, 0, 0.903 416, 26, 0, 1, 0.903 416, 0, 26, 0, 0.881 416, 0, 26, 1, 0.881 416, 26, 26, 0, 1.035 416, 26, 26, 1, 1.035 416, 2048, 0, 0, 1.017 416, 2048, 0, 1, 1.017 416, 2074, 0, 0, 0.903 416, 2074, 0, 1, 0.903 416, 2048, 26, 0, 0.881 416, 2048, 26, 1, 0.881 416, 2074, 26, 0, 1.034 416, 2074, 26, 1, 1.035 432, 0, 0, 0, 1.0 432, 0, 0, 1, 1.0 432, 27, 0, 0, 0.933 432, 27, 0, 1, 0.933 432, 0, 27, 0, 0.941 432, 0, 27, 1, 0.941 432, 27, 27, 0, 0.953 432, 27, 27, 1, 0.954 432, 2048, 0, 0, 1.0 432, 2048, 0, 1, 1.0 432, 2075, 0, 0, 0.933 432, 2075, 0, 1, 0.933 432, 2048, 27, 0, 0.941 432, 2048, 27, 1, 0.941 432, 2075, 27, 0, 0.93 432, 2075, 27, 1, 0.93 448, 0, 0, 0, 0.984 448, 0, 0, 1, 0.984 448, 28, 0, 0, 0.896 448, 28, 0, 1, 0.896 448, 0, 28, 0, 1.244 448, 0, 28, 1, 1.244 448, 28, 28, 0, 1.333 448, 28, 28, 1, 1.333 448, 2048, 0, 0, 0.984 448, 2048, 0, 1, 0.984 448, 2076, 0, 0, 0.896 448, 2076, 0, 1, 0.896 448, 2048, 28, 0, 0.988 448, 2048, 28, 1, 0.988 448, 2076, 28, 0, 1.333 448, 2076, 28, 1, 1.333 464, 0, 0, 0, 1.083 464, 0, 0, 1, 1.083 464, 29, 0, 0, 0.978 464, 29, 0, 1, 0.978 464, 0, 29, 0, 0.924 464, 0, 29, 1, 0.924 464, 29, 29, 0, 0.901 464, 29, 29, 1, 0.901 464, 2048, 0, 0, 1.083 464, 2048, 0, 1, 1.083 464, 2077, 0, 0, 0.978 464, 2077, 0, 1, 0.978 464, 2048, 29, 0, 0.924 464, 2048, 29, 1, 0.924 464, 2077, 29, 0, 0.89 464, 2077, 29, 1, 0.89 480, 0, 0, 0, 1.066 480, 0, 0, 1, 1.066 480, 30, 0, 0, 0.9 480, 30, 0, 1, 0.9 480, 0, 30, 0, 0.88 480, 0, 30, 1, 0.88 480, 30, 30, 0, 1.083 480, 30, 30, 1, 1.083 480, 2048, 0, 0, 1.066 480, 2048, 0, 1, 1.066 480, 2078, 0, 0, 0.9 480, 2078, 0, 1, 0.9 480, 2048, 30, 0, 0.88 480, 2048, 30, 1, 0.88 480, 2078, 30, 0, 1.083 480, 2078, 30, 1, 1.083 496, 0, 0, 0, 1.032 496, 0, 0, 1, 1.032 496, 31, 0, 0, 0.95 496, 31, 0, 1, 0.95 496, 0, 31, 0, 1.011 496, 0, 31, 1, 1.011 496, 31, 31, 0, 0.973 496, 31, 31, 1, 0.973 496, 2048, 0, 0, 1.032 496, 2048, 0, 1, 1.032 496, 2079, 0, 0, 0.95 496, 2079, 0, 1, 0.95 496, 2048, 31, 0, 1.011 496, 2048, 31, 1, 1.011 496, 2079, 31, 0, 0.941 496, 2079, 31, 1, 0.941 1024, 32, 0, 0, 1.143 1024, 32, 0, 1, 1.143 1024, 0, 32, 0, 1.143 1024, 0, 32, 1, 1.143 1024, 32, 32, 0, 1.143 1024, 32, 32, 1, 1.143 1024, 2080, 0, 0, 1.143 1024, 2080, 0, 1, 1.143 1024, 2048, 32, 0, 1.143 1024, 2048, 32, 1, 1.143 1024, 2080, 32, 0, 1.143 1024, 2080, 32, 1, 1.143 1056, 0, 0, 0, 1.168 1056, 0, 0, 1, 1.168 1056, 33, 0, 0, 1.067 1056, 33, 0, 1, 1.067 1056, 0, 33, 0, 0.977 1056, 0, 33, 1, 0.977 1056, 33, 33, 0, 1.043 1056, 33, 33, 1, 1.043 1056, 2048, 0, 0, 1.168 1056, 2048, 0, 1, 1.168 1056, 2081, 0, 0, 1.067 1056, 2081, 0, 1, 1.067 1056, 2048, 33, 0, 0.977 1056, 2048, 33, 1, 0.977 1056, 2081, 33, 0, 1.0 1056, 2081, 33, 1, 1.0 1088, 0, 0, 0, 1.171 1088, 0, 0, 1, 1.171 1088, 34, 0, 0, 1.041 1088, 34, 0, 1, 1.041 1088, 0, 34, 0, 1.079 1088, 0, 34, 1, 1.079 1088, 34, 34, 0, 0.966 1088, 34, 34, 1, 0.966 1088, 2048, 0, 0, 1.171 1088, 2048, 0, 1, 1.171 1088, 2082, 0, 0, 1.041 1088, 2082, 0, 1, 1.041 1088, 2048, 34, 0, 0.994 1088, 2048, 34, 1, 0.994 1088, 
2082, 34, 0, 0.966 1088, 2082, 34, 1, 0.966 1120, 0, 0, 0, 1.152 1120, 0, 0, 1, 1.153 1120, 35, 0, 0, 1.051 1120, 35, 0, 1, 1.051 1120, 0, 35, 0, 1.0 1120, 0, 35, 1, 1.0 1120, 35, 35, 0, 1.068 1120, 35, 35, 1, 1.068 1120, 2048, 0, 0, 1.151 1120, 2048, 0, 1, 1.151 1120, 2083, 0, 0, 1.051 1120, 2083, 0, 1, 1.051 1120, 2048, 35, 0, 1.0 1120, 2048, 35, 1, 1.0 1120, 2083, 35, 0, 1.027 1120, 2083, 35, 1, 1.027 1152, 0, 0, 0, 1.159 1152, 0, 0, 1, 1.159 1152, 36, 0, 0, 1.034 1152, 36, 0, 1, 1.034 1152, 0, 36, 0, 1.07 1152, 0, 36, 1, 1.07 1152, 36, 36, 0, 0.967 1152, 36, 36, 1, 0.967 1152, 2048, 0, 0, 1.159 1152, 2048, 0, 1, 1.159 1152, 2084, 0, 0, 1.034 1152, 2084, 0, 1, 1.034 1152, 2048, 36, 0, 0.984 1152, 2048, 36, 1, 0.984 1152, 2084, 36, 0, 0.967 1152, 2084, 36, 1, 0.967 1184, 0, 0, 0, 1.157 1184, 0, 0, 1, 1.157 1184, 37, 0, 0, 1.067 1184, 37, 0, 1, 1.066 1184, 0, 37, 0, 0.993 1184, 0, 37, 1, 0.993 1184, 37, 37, 0, 1.08 1184, 37, 37, 1, 1.081 1184, 2048, 0, 0, 1.157 1184, 2048, 0, 1, 1.157 1184, 2085, 0, 0, 1.066 1184, 2085, 0, 1, 1.066 1184, 2048, 37, 0, 0.993 1184, 2048, 37, 1, 0.993 1184, 2085, 37, 0, 1.04 1184, 2085, 37, 1, 1.04 1216, 0, 0, 0, 1.139 1216, 0, 0, 1, 1.139 1216, 38, 0, 0, 1.024 1216, 38, 0, 1, 1.024 1216, 0, 38, 0, 1.087 1216, 0, 38, 1, 1.087 1216, 38, 38, 0, 1.0 1216, 38, 38, 1, 1.0 1216, 2048, 0, 0, 1.138 1216, 2048, 0, 1, 1.138 1216, 2086, 0, 0, 1.024 1216, 2086, 0, 1, 1.024 1216, 2048, 38, 0, 1.01 1216, 2048, 38, 1, 1.01 1216, 2086, 38, 0, 1.0 1216, 2086, 38, 1, 1.0 1248, 0, 0, 0, 1.176 1248, 0, 0, 1, 1.174 1248, 39, 0, 0, 1.074 1248, 39, 0, 1, 1.074 1248, 0, 39, 0, 0.966 1248, 0, 39, 1, 0.985 1248, 39, 39, 0, 1.064 1248, 39, 39, 1, 1.064 1248, 2048, 0, 0, 1.179 1248, 2048, 0, 1, 1.179 1248, 2087, 0, 0, 1.074 1248, 2087, 0, 1, 1.074 1248, 2048, 39, 0, 0.985 1248, 2048, 39, 1, 0.985 1248, 2087, 39, 0, 1.026 1248, 2087, 39, 1, 1.026 1280, 0, 0, 0, 0.993 1280, 0, 0, 1, 0.993 1280, 40, 0, 0, 1.051 1280, 40, 0, 1, 1.051 1280, 0, 40, 0, 1.044 1280, 0, 40, 1, 1.045 1280, 40, 40, 0, 1.25 1280, 40, 40, 1, 1.25 1280, 2048, 0, 0, 0.992 1280, 2048, 0, 1, 0.992 1280, 2088, 0, 0, 1.051 1280, 2088, 0, 1, 1.051 1280, 2048, 40, 0, 0.946 1280, 2048, 40, 1, 0.946 1280, 2088, 40, 0, 1.252 1280, 2088, 40, 1, 1.252 1312, 0, 0, 0, 0.969 1312, 0, 0, 1, 0.969 1312, 41, 0, 0, 0.991 1312, 41, 0, 1, 0.991 1312, 0, 41, 0, 0.837 1312, 0, 41, 1, 0.837 1312, 41, 41, 0, 1.025 1312, 41, 41, 1, 1.025 1312, 2048, 0, 0, 0.969 1312, 2048, 0, 1, 0.969 1312, 2089, 0, 0, 0.991 1312, 2089, 0, 1, 0.99 1312, 2048, 41, 0, 0.837 1312, 2048, 41, 1, 0.837 1312, 2089, 41, 0, 0.975 1312, 2089, 41, 1, 0.975 1344, 0, 0, 0, 0.988 1344, 0, 0, 1, 0.988 1344, 42, 0, 0, 1.031 1344, 42, 0, 1, 1.031 1344, 0, 42, 0, 1.033 1344, 0, 42, 1, 1.033 1344, 42, 42, 0, 0.982 1344, 42, 42, 1, 0.982 1344, 2048, 0, 0, 0.992 1344, 2048, 0, 1, 0.992 1344, 2090, 0, 0, 1.031 1344, 2090, 0, 1, 1.031 1344, 2048, 42, 0, 0.943 1344, 2048, 42, 1, 0.942 1344, 2090, 42, 0, 0.982 1344, 2090, 42, 1, 0.982 1376, 0, 0, 0, 1.016 1376, 0, 0, 1, 1.016 1376, 43, 0, 0, 1.01 1376, 43, 0, 1, 1.01 1376, 0, 43, 0, 0.829 1376, 0, 43, 1, 0.829 1376, 43, 43, 0, 1.024 1376, 43, 43, 1, 1.024 1376, 2048, 0, 0, 1.006 1376, 2048, 0, 1, 1.015 1376, 2091, 0, 0, 1.01 1376, 2091, 0, 1, 1.01 1376, 2048, 43, 0, 0.829 1376, 2048, 43, 1, 0.829 1376, 2091, 43, 0, 0.98 1376, 2091, 43, 1, 0.98 1408, 0, 0, 0, 0.987 1408, 0, 0, 1, 0.987 1408, 44, 0, 0, 1.015 1408, 44, 0, 1, 1.015 1408, 0, 44, 0, 1.018 1408, 0, 44, 1, 1.014 1408, 44, 44, 0, 1.004 1408, 44, 44, 1, 0.994 1408, 2048, 0, 0, 
0.988 1408, 2048, 0, 1, 0.988 1408, 2092, 0, 0, 1.015 1408, 2092, 0, 1, 1.015 1408, 2048, 44, 0, 0.955 1408, 2048, 44, 1, 0.955 1408, 2092, 44, 0, 1.0 1408, 2092, 44, 1, 0.994 1440, 0, 0, 0, 0.986 1440, 0, 0, 1, 0.986 1440, 45, 0, 0, 1.013 1440, 45, 0, 1, 1.013 1440, 0, 45, 0, 0.814 1440, 0, 45, 1, 0.814 1440, 45, 45, 0, 1.006 1440, 45, 45, 1, 1.006 1440, 2048, 0, 0, 0.986 1440, 2048, 0, 1, 0.986 1440, 2093, 0, 0, 1.013 1440, 2093, 0, 1, 1.013 1440, 2048, 45, 0, 0.814 1440, 2048, 45, 1, 0.814 1440, 2093, 45, 0, 0.966 1440, 2093, 45, 1, 0.966 1472, 0, 0, 0, 0.997 1472, 0, 0, 1, 0.994 1472, 46, 0, 0, 1.045 1472, 46, 0, 1, 1.045 1472, 0, 46, 0, 1.026 1472, 0, 46, 1, 1.026 1472, 46, 46, 0, 0.966 1472, 46, 46, 1, 0.966 1472, 2048, 0, 0, 1.0 1472, 2048, 0, 1, 0.996 1472, 2094, 0, 0, 1.045 1472, 2094, 0, 1, 1.045 1472, 2048, 46, 0, 0.939 1472, 2048, 46, 1, 0.939 1472, 2094, 46, 0, 0.966 1472, 2094, 46, 1, 0.966 1504, 0, 0, 0, 0.993 1504, 0, 0, 1, 0.993 1504, 47, 0, 0, 0.999 1504, 47, 0, 1, 0.999 1504, 0, 47, 0, 0.826 1504, 0, 47, 1, 0.826 1504, 47, 47, 0, 1.023 1504, 47, 47, 1, 1.023 1504, 2048, 0, 0, 0.993 1504, 2048, 0, 1, 0.993 1504, 2095, 0, 0, 0.999 1504, 2095, 0, 1, 0.999 1504, 2048, 47, 0, 0.826 1504, 2048, 47, 1, 0.826 1504, 2095, 47, 0, 0.993 1504, 2095, 47, 1, 0.993 1536, 0, 0, 0, 0.992 1536, 0, 0, 1, 0.991 1536, 48, 0, 0, 1.019 1536, 48, 0, 1, 1.019 1536, 0, 48, 0, 1.025 1536, 0, 48, 1, 1.024 1536, 48, 48, 0, 0.994 1536, 48, 48, 1, 0.994 1536, 2048, 0, 0, 0.994 1536, 2048, 0, 1, 0.994 1536, 2096, 0, 0, 1.019 1536, 2096, 0, 1, 1.019 1536, 2048, 48, 0, 1.025 1536, 2048, 48, 1, 1.025 1536, 2096, 48, 0, 0.994 1536, 2096, 48, 1, 0.994 1568, 0, 0, 0, 0.994 1568, 0, 0, 1, 0.994 1568, 49, 0, 0, 0.903 1568, 49, 0, 1, 0.903 1568, 0, 49, 0, 1.144 1568, 0, 49, 1, 1.144 1568, 49, 49, 0, 1.461 1568, 49, 49, 1, 1.461 1568, 2048, 0, 0, 0.993 1568, 2048, 0, 1, 0.993 1568, 2097, 0, 0, 0.903 1568, 2097, 0, 1, 0.903 1568, 2048, 49, 0, 1.09 1568, 2048, 49, 1, 1.09 1568, 2097, 49, 0, 1.46 1568, 2097, 49, 1, 1.46 1600, 0, 0, 0, 0.981 1600, 0, 0, 1, 0.981 1600, 50, 0, 0, 1.022 1600, 50, 0, 1, 1.022 1600, 0, 50, 0, 1.017 1600, 0, 50, 1, 1.017 1600, 50, 50, 0, 0.973 1600, 50, 50, 1, 0.973 1600, 2048, 0, 0, 0.981 1600, 2048, 0, 1, 0.981 1600, 2098, 0, 0, 1.022 1600, 2098, 0, 1, 1.022 1600, 2048, 50, 0, 0.961 1600, 2048, 50, 1, 0.961 1600, 2098, 50, 0, 0.973 1600, 2098, 50, 1, 0.973 1632, 0, 0, 0, 1.019 1632, 0, 0, 1, 1.019 1632, 51, 0, 0, 0.893 1632, 51, 0, 1, 0.893 1632, 0, 51, 0, 1.131 1632, 0, 51, 1, 1.131 1632, 51, 51, 0, 1.444 1632, 51, 51, 1, 1.444 1632, 2048, 0, 0, 1.019 1632, 2048, 0, 1, 1.019 1632, 2099, 0, 0, 0.893 1632, 2099, 0, 1, 0.893 1632, 2048, 51, 0, 1.079 1632, 2048, 51, 1, 1.079 1632, 2099, 51, 0, 1.449 1632, 2099, 51, 1, 1.449 1664, 0, 0, 0, 1.005 1664, 0, 0, 1, 1.004 1664, 52, 0, 0, 0.986 1664, 52, 0, 1, 0.986 1664, 0, 52, 0, 1.004 1664, 0, 52, 1, 1.004 1664, 52, 52, 0, 0.976 1664, 52, 52, 1, 0.976 1664, 2048, 0, 0, 1.006 1664, 2048, 0, 1, 1.006 1664, 2100, 0, 0, 0.993 1664, 2100, 0, 1, 0.993 1664, 2048, 52, 0, 0.946 1664, 2048, 52, 1, 0.946 1664, 2100, 52, 0, 0.976 1664, 2100, 52, 1, 0.976 1696, 0, 0, 0, 0.994 1696, 0, 0, 1, 0.992 1696, 53, 0, 0, 0.884 1696, 53, 0, 1, 0.884 1696, 0, 53, 0, 1.141 1696, 0, 53, 1, 1.141 1696, 53, 53, 0, 1.43 1696, 53, 53, 1, 1.43 1696, 2048, 0, 0, 0.994 1696, 2048, 0, 1, 0.994 1696, 2101, 0, 0, 0.884 1696, 2101, 0, 1, 0.884 1696, 2048, 53, 0, 1.088 1696, 2048, 53, 1, 1.088 1696, 2101, 53, 0, 1.429 1696, 2101, 53, 1, 1.429 1728, 0, 0, 0, 0.978 1728, 0, 0, 1, 
0.978 1728, 54, 0, 0, 1.031 1728, 54, 0, 1, 1.033 1728, 0, 54, 0, 1.0 1728, 0, 54, 1, 1.0 1728, 54, 54, 0, 0.96 1728, 54, 54, 1, 0.96 1728, 2048, 0, 0, 0.976 1728, 2048, 0, 1, 0.976 1728, 2102, 0, 0, 1.033 1728, 2102, 0, 1, 1.033 1728, 2048, 54, 0, 0.947 1728, 2048, 54, 1, 0.947 1728, 2102, 54, 0, 0.96 1728, 2102, 54, 1, 0.96 1760, 0, 0, 0, 1.019 1760, 0, 0, 1, 1.021 1760, 55, 0, 0, 0.9 1760, 55, 0, 1, 0.9 1760, 0, 55, 0, 1.125 1760, 0, 55, 1, 1.125 1760, 55, 55, 0, 1.437 1760, 55, 55, 1, 1.436 1760, 2048, 0, 0, 1.016 1760, 2048, 0, 1, 1.015 1760, 2103, 0, 0, 0.9 1760, 2103, 0, 1, 0.9 1760, 2048, 55, 0, 1.073 1760, 2048, 55, 1, 1.074 1760, 2103, 55, 0, 1.44 1760, 2103, 55, 1, 1.44 1792, 0, 0, 0, 1.002 1792, 0, 0, 1, 1.002 1792, 56, 0, 0, 1.028 1792, 56, 0, 1, 1.028 1792, 0, 56, 0, 1.014 1792, 0, 56, 1, 1.015 1792, 56, 56, 0, 1.191 1792, 56, 56, 1, 1.191 1792, 2048, 0, 0, 1.003 1792, 2048, 0, 1, 1.003 1792, 2104, 0, 0, 1.028 1792, 2104, 0, 1, 1.028 1792, 2048, 56, 0, 0.963 1792, 2048, 56, 1, 0.963 1792, 2104, 56, 0, 1.191 1792, 2104, 56, 1, 1.191 1824, 0, 0, 0, 0.999 1824, 0, 0, 1, 1.0 1824, 57, 0, 0, 0.891 1824, 57, 0, 1, 0.891 1824, 0, 57, 0, 1.114 1824, 0, 57, 1, 1.114 1824, 57, 57, 0, 1.407 1824, 57, 57, 1, 1.407 1824, 2048, 0, 0, 1.001 1824, 2048, 0, 1, 1.001 1824, 2105, 0, 0, 0.891 1824, 2105, 0, 1, 0.891 1824, 2048, 57, 0, 1.064 1824, 2048, 57, 1, 1.064 1824, 2105, 57, 0, 1.407 1824, 2105, 57, 1, 1.407 1856, 0, 0, 0, 0.989 1856, 0, 0, 1, 0.987 1856, 58, 0, 0, 1.042 1856, 58, 0, 1, 1.042 1856, 0, 58, 0, 1.007 1856, 0, 58, 1, 1.007 1856, 58, 58, 0, 0.978 1856, 58, 58, 1, 0.972 1856, 2048, 0, 0, 0.992 1856, 2048, 0, 1, 0.992 1856, 2106, 0, 0, 1.042 1856, 2106, 0, 1, 1.042 1856, 2048, 58, 0, 0.954 1856, 2048, 58, 1, 0.954 1856, 2106, 58, 0, 0.979 1856, 2106, 58, 1, 0.972 1888, 0, 0, 0, 0.994 1888, 0, 0, 1, 0.994 1888, 59, 0, 0, 0.883 1888, 59, 0, 1, 0.883 1888, 0, 59, 0, 1.121 1888, 0, 59, 1, 1.123 1888, 59, 59, 0, 1.413 1888, 59, 59, 1, 1.413 1888, 2048, 0, 0, 0.985 1888, 2048, 0, 1, 0.994 1888, 2107, 0, 0, 0.883 1888, 2107, 0, 1, 0.883 1888, 2048, 59, 0, 1.076 1888, 2048, 59, 1, 1.076 1888, 2107, 59, 0, 1.413 1888, 2107, 59, 1, 1.413 1920, 0, 0, 0, 1.0 1920, 0, 0, 1, 0.999 1920, 60, 0, 0, 1.033 1920, 60, 0, 1, 1.033 1920, 0, 60, 0, 0.996 1920, 0, 60, 1, 0.997 1920, 60, 60, 0, 0.968 1920, 60, 60, 1, 0.968 1920, 2048, 0, 0, 1.0 1920, 2048, 0, 1, 1.0 1920, 2108, 0, 0, 1.034 1920, 2108, 0, 1, 1.034 1920, 2048, 60, 0, 0.949 1920, 2048, 60, 1, 0.949 1920, 2108, 60, 0, 0.968 1920, 2108, 60, 1, 0.968 1952, 0, 0, 0, 1.004 1952, 0, 0, 1, 1.004 1952, 61, 0, 0, 0.898 1952, 61, 0, 1, 0.898 1952, 0, 61, 0, 1.118 1952, 0, 61, 1, 1.118 1952, 61, 61, 0, 1.387 1952, 61, 61, 1, 1.387 1952, 2048, 0, 0, 1.004 1952, 2048, 0, 1, 1.004 1952, 2109, 0, 0, 0.898 1952, 2109, 0, 1, 0.898 1952, 2048, 61, 0, 1.071 1952, 2048, 61, 1, 1.071 1952, 2109, 61, 0, 1.387 1952, 2109, 61, 1, 1.387 1984, 0, 0, 0, 0.993 1984, 0, 0, 1, 0.993 1984, 62, 0, 0, 1.025 1984, 62, 0, 1, 1.025 1984, 0, 62, 0, 1.005 1984, 0, 62, 1, 1.007 1984, 62, 62, 0, 0.982 1984, 62, 62, 1, 0.982 1984, 2048, 0, 0, 0.993 1984, 2048, 0, 1, 0.993 1984, 2110, 0, 0, 1.025 1984, 2110, 0, 1, 1.025 1984, 2048, 62, 0, 0.96 1984, 2048, 62, 1, 0.96 1984, 2110, 62, 0, 0.982 1984, 2110, 62, 1, 0.982 2016, 0, 0, 0, 1.0 2016, 0, 0, 1, 0.999 2016, 63, 0, 0, 0.889 2016, 63, 0, 1, 0.89 2016, 0, 63, 0, 1.091 2016, 0, 63, 1, 1.092 2016, 63, 63, 0, 1.362 2016, 63, 63, 1, 1.363 2016, 2048, 0, 0, 1.0 2016, 2048, 0, 1, 1.0 2016, 2111, 0, 0, 0.965 2016, 2111, 0, 1, 0.965 2016, 
2048, 63, 0, 1.049 2016, 2048, 63, 1, 1.049 2016, 2111, 63, 0, 1.405 2016, 2111, 63, 1, 1.405 2048, 32, 0, 0, 1.01 2048, 32, 0, 1, 1.01 2048, 0, 32, 0, 1.005 2048, 0, 32, 1, 1.005 2048, 32, 32, 0, 1.005 2048, 32, 32, 1, 1.005 2048, 0, 1, 0, 0.983 2048, 0, 1, 1, 0.984 2048, 1, 0, 0, 1.039 2048, 1, 0, 1, 1.039 2048, 32, 1, 0, 1.063 2048, 32, 1, 1, 1.063 2048, 1, 32, 0, 0.94 2048, 1, 32, 1, 0.94 2048, 2048, 1, 0, 0.981 2048, 2048, 1, 1, 0.981 2048, 2049, 0, 0, 0.904 2048, 2049, 0, 1, 0.904 2112, 0, 0, 0, 0.996 2112, 0, 0, 1, 0.995 2112, 1, 0, 0, 1.031 2112, 1, 0, 1, 1.031 2112, 33, 0, 0, 1.01 2112, 33, 0, 1, 1.01 2112, 0, 1, 0, 0.972 2112, 0, 1, 1, 0.972 2112, 0, 33, 0, 0.987 2112, 0, 33, 1, 0.987 2112, 1, 1, 0, 0.914 2112, 1, 1, 1, 0.914 2112, 33, 33, 0, 0.983 2112, 33, 33, 1, 0.983 2112, 2048, 0, 0, 0.994 2112, 2048, 0, 1, 0.99 2112, 2049, 0, 0, 1.031 2112, 2049, 0, 1, 1.031 2112, 2048, 1, 0, 0.955 2112, 2048, 1, 1, 0.955 2112, 2049, 1, 0, 0.906 2112, 2049, 1, 1, 0.906 2112, 33, 1, 0, 1.163 2112, 33, 1, 1, 1.164 2112, 1, 33, 0, 1.046 2112, 1, 33, 1, 1.046 2176, 0, 0, 0, 0.984 2176, 0, 0, 1, 0.985 2176, 2, 0, 0, 1.023 2176, 2, 0, 1, 1.023 2176, 34, 0, 0, 1.0 2176, 34, 0, 1, 1.0 2176, 0, 2, 0, 0.985 2176, 0, 2, 1, 0.985 2176, 0, 34, 0, 0.995 2176, 0, 34, 1, 0.982 2176, 2, 2, 0, 0.928 2176, 2, 2, 1, 0.928 2176, 34, 34, 0, 1.004 2176, 34, 34, 1, 1.004 2176, 2048, 0, 0, 0.985 2176, 2048, 0, 1, 0.986 2176, 2050, 0, 0, 1.023 2176, 2050, 0, 1, 1.023 2176, 2048, 2, 0, 0.802 2176, 2048, 2, 1, 0.802 2176, 2050, 2, 0, 0.894 2176, 2050, 2, 1, 0.894 2176, 2, 1, 0, 1.068 2176, 2, 1, 1, 1.068 2176, 1, 2, 0, 0.976 2176, 1, 2, 1, 0.976 2176, 34, 1, 0, 1.077 2176, 34, 1, 1, 1.077 2176, 1, 34, 0, 0.978 2176, 1, 34, 1, 0.978 2176, 2050, 1, 0, 1.061 2176, 2050, 1, 1, 1.061 2176, 2049, 2, 0, 0.971 2176, 2049, 2, 1, 0.971 2240, 0, 0, 0, 0.994 2240, 0, 0, 1, 0.994 2240, 3, 0, 0, 1.038 2240, 3, 0, 1, 1.039 2240, 35, 0, 0, 1.019 2240, 35, 0, 1, 1.019 2240, 0, 3, 0, 0.979 2240, 0, 3, 1, 0.98 2240, 0, 35, 0, 0.991 2240, 0, 35, 1, 0.991 2240, 3, 3, 0, 0.931 2240, 3, 3, 1, 0.931 2240, 35, 35, 0, 0.999 2240, 35, 35, 1, 0.999 2240, 2048, 0, 0, 0.995 2240, 2048, 0, 1, 0.995 2240, 2051, 0, 0, 1.039 2240, 2051, 0, 1, 1.039 2240, 2048, 3, 0, 0.799 2240, 2048, 3, 1, 0.799 2240, 2051, 3, 0, 0.889 2240, 2051, 3, 1, 0.889 2240, 3, 1, 0, 1.06 2240, 3, 1, 1, 1.06 2240, 1, 3, 0, 0.968 2240, 1, 3, 1, 0.968 2240, 35, 1, 0, 1.071 2240, 35, 1, 1, 1.071 2240, 1, 35, 0, 0.971 2240, 1, 35, 1, 0.971 2240, 2051, 1, 0, 1.057 2240, 2051, 1, 1, 1.057 2240, 2049, 3, 0, 0.966 2240, 2049, 3, 1, 0.966 2304, 0, 0, 0, 0.986 2304, 0, 0, 1, 0.986 2304, 4, 0, 0, 1.031 2304, 4, 0, 1, 1.032 2304, 36, 0, 0, 1.011 2304, 36, 0, 1, 1.011 2304, 0, 4, 0, 0.968 2304, 0, 4, 1, 0.969 2304, 0, 36, 0, 0.988 2304, 0, 36, 1, 0.988 2304, 4, 4, 0, 0.93 2304, 4, 4, 1, 0.931 2304, 36, 36, 0, 0.992 2304, 36, 36, 1, 0.992 2304, 2048, 0, 0, 0.988 2304, 2048, 0, 1, 0.988 2304, 2052, 0, 0, 1.032 2304, 2052, 0, 1, 1.032 2304, 2048, 4, 0, 0.793 2304, 2048, 4, 1, 0.793 2304, 2052, 4, 0, 0.884 2304, 2052, 4, 1, 0.884 2304, 4, 1, 0, 0.989 2304, 4, 1, 1, 0.989 2304, 1, 4, 0, 0.897 2304, 1, 4, 1, 0.898 2304, 36, 1, 0, 1.057 2304, 36, 1, 1, 1.057 2304, 1, 36, 0, 0.966 2304, 1, 36, 1, 0.966 2304, 2052, 1, 0, 1.052 2304, 2052, 1, 1, 1.052 2304, 2049, 4, 0, 0.955 2304, 2049, 4, 1, 0.955 2368, 0, 0, 0, 1.0 2368, 0, 0, 1, 1.001 2368, 5, 0, 0, 1.024 2368, 5, 0, 1, 1.025 2368, 37, 0, 0, 1.0 2368, 37, 0, 1, 1.0 2368, 0, 5, 0, 0.98 2368, 0, 5, 1, 0.981 2368, 0, 37, 0, 0.983 2368, 0, 37, 1, 0.98 
2368, 5, 5, 0, 0.944 2368, 5, 5, 1, 0.944 2368, 37, 37, 0, 1.003 2368, 37, 37, 1, 1.003 2368, 2048, 0, 0, 1.002 2368, 2048, 0, 1, 1.002 2368, 2053, 0, 0, 1.025 2368, 2053, 0, 1, 1.025 2368, 2048, 5, 0, 0.801 2368, 2048, 5, 1, 0.801 2368, 2053, 5, 0, 0.907 2368, 2053, 5, 1, 0.907 2368, 5, 1, 0, 1.071 2368, 5, 1, 1, 1.071 2368, 1, 5, 0, 0.973 2368, 1, 5, 1, 0.973 2368, 37, 1, 0, 1.07 2368, 37, 1, 1, 1.07 2368, 1, 37, 0, 0.974 2368, 1, 37, 1, 0.974 2368, 2053, 1, 0, 1.065 2368, 2053, 1, 1, 1.065 2368, 2049, 5, 0, 0.967 2368, 2049, 5, 1, 0.967 2432, 0, 0, 0, 0.965 2432, 0, 0, 1, 1.0 2432, 6, 0, 0, 1.038 2432, 6, 0, 1, 1.039 2432, 38, 0, 0, 1.021 2432, 38, 0, 1, 1.021 2432, 0, 6, 0, 0.974 2432, 0, 6, 1, 0.976 2432, 0, 38, 0, 0.986 2432, 0, 38, 1, 0.986 2432, 6, 6, 0, 0.926 2432, 6, 6, 1, 0.926 2432, 38, 38, 0, 1.0 2432, 38, 38, 1, 1.0 2432, 2048, 0, 0, 1.004 2432, 2048, 0, 1, 1.004 2432, 2054, 0, 0, 1.039 2432, 2054, 0, 1, 1.039 2432, 2048, 6, 0, 0.797 2432, 2048, 6, 1, 0.797 2432, 2054, 6, 0, 0.898 2432, 2054, 6, 1, 0.898 2432, 6, 1, 0, 1.063 2432, 6, 1, 1, 1.063 2432, 1, 6, 0, 0.965 2432, 1, 6, 1, 0.965 2432, 38, 1, 0, 1.068 2432, 38, 1, 1, 1.068 2432, 1, 38, 0, 0.968 2432, 1, 38, 1, 0.968 2432, 2054, 1, 0, 1.06 2432, 2054, 1, 1, 1.06 2432, 2049, 6, 0, 0.963 2432, 2049, 6, 1, 0.963 2496, 0, 0, 0, 1.013 2496, 0, 0, 1, 1.013 2496, 7, 0, 0, 1.032 2496, 7, 0, 1, 1.032 2496, 39, 0, 0, 1.013 2496, 39, 0, 1, 1.013 2496, 0, 7, 0, 0.965 2496, 0, 7, 1, 0.965 2496, 0, 39, 0, 0.979 2496, 0, 39, 1, 0.979 2496, 7, 7, 0, 0.925 2496, 7, 7, 1, 0.925 2496, 39, 39, 0, 0.989 2496, 39, 39, 1, 0.989 2496, 2048, 0, 0, 1.013 2496, 2048, 0, 1, 1.013 2496, 2055, 0, 0, 1.032 2496, 2055, 0, 1, 1.032 2496, 2048, 7, 0, 0.792 2496, 2048, 7, 1, 0.792 2496, 2055, 7, 0, 0.93 2496, 2055, 7, 1, 0.93 2496, 7, 1, 0, 0.984 2496, 7, 1, 1, 0.984 2496, 1, 7, 0, 0.894 2496, 1, 7, 1, 0.895 2496, 39, 1, 0, 1.054 2496, 39, 1, 1, 1.054 2496, 1, 39, 0, 0.963 2496, 1, 39, 1, 0.963 2496, 2055, 1, 0, 1.049 2496, 2055, 1, 1, 1.049 2496, 2049, 7, 0, 0.953 2496, 2049, 7, 1, 0.953 2560, 0, 0, 0, 0.991 2560, 0, 0, 1, 0.991 2560, 8, 0, 0, 1.031 2560, 8, 0, 1, 1.032 2560, 40, 0, 0, 1.029 2560, 40, 0, 1, 1.029 2560, 0, 8, 0, 0.992 2560, 0, 8, 1, 0.992 2560, 0, 40, 0, 0.975 2560, 0, 40, 1, 0.984 2560, 8, 8, 0, 0.942 2560, 8, 8, 1, 0.943 2560, 40, 40, 0, 1.139 2560, 40, 40, 1, 1.139 2560, 2048, 0, 0, 0.993 2560, 2048, 0, 1, 0.993 2560, 2056, 0, 0, 1.032 2560, 2056, 0, 1, 1.032 2560, 2048, 8, 0, 0.812 2560, 2048, 8, 1, 0.812 2560, 2056, 8, 0, 0.912 2560, 2056, 8, 1, 0.912 2560, 8, 1, 0, 1.068 2560, 8, 1, 1, 1.069 2560, 1, 8, 0, 0.974 2560, 1, 8, 1, 0.974 2560, 40, 1, 0, 1.068 2560, 40, 1, 1, 1.068 2560, 1, 40, 0, 0.996 2560, 1, 40, 1, 0.996 2560, 2056, 1, 0, 1.063 2560, 2056, 1, 1, 1.063 2560, 2049, 8, 0, 0.969 2560, 2049, 8, 1, 0.969 2624, 0, 0, 0, 0.995 2624, 0, 0, 1, 0.994 2624, 9, 0, 0, 1.015 2624, 9, 0, 1, 1.018 2624, 41, 0, 0, 1.044 2624, 41, 0, 1, 1.044 2624, 0, 9, 0, 0.988 2624, 0, 9, 1, 0.99 2624, 0, 41, 0, 0.989 2624, 0, 41, 1, 0.99 2624, 9, 9, 0, 0.943 2624, 9, 9, 1, 0.943 2624, 41, 41, 0, 0.993 2624, 41, 41, 1, 0.993 2624, 2048, 0, 0, 0.998 2624, 2048, 0, 1, 0.998 2624, 2057, 0, 0, 1.018 2624, 2057, 0, 1, 1.018 2624, 2048, 9, 0, 0.81 2624, 2048, 9, 1, 0.81 2624, 2057, 9, 0, 0.907 2624, 2057, 9, 1, 0.907 2624, 9, 1, 0, 1.09 2624, 9, 1, 1, 1.09 2624, 1, 9, 0, 0.967 2624, 1, 9, 1, 0.967 2624, 41, 1, 0, 1.084 2624, 41, 1, 1, 1.085 2624, 1, 41, 0, 0.958 2624, 1, 41, 1, 0.957 2624, 2057, 1, 0, 1.087 2624, 2057, 1, 1, 1.087 2624, 2049, 9, 0, 0.965 
2624, 2049, 9, 1, 0.965 2688, 0, 0, 0, 0.995 2688, 0, 0, 1, 0.995 2688, 10, 0, 0, 1.01 2688, 10, 0, 1, 1.012 2688, 42, 0, 0, 1.036 2688, 42, 0, 1, 1.036 2688, 0, 10, 0, 0.978 2688, 0, 10, 1, 0.979 2688, 0, 42, 0, 0.977 2688, 0, 42, 1, 0.978 2688, 10, 10, 0, 0.942 2688, 10, 10, 1, 0.942 2688, 42, 42, 0, 0.989 2688, 42, 42, 1, 0.989 2688, 2048, 0, 0, 0.995 2688, 2048, 0, 1, 0.995 2688, 2058, 0, 0, 1.012 2688, 2058, 0, 1, 1.012 2688, 2048, 10, 0, 0.804 2688, 2048, 10, 1, 0.804 2688, 2058, 10, 0, 0.905 2688, 2058, 10, 1, 0.905 2688, 10, 1, 0, 0.986 2688, 10, 1, 1, 0.987 2688, 1, 10, 0, 0.893 2688, 1, 10, 1, 0.894 2688, 42, 1, 0, 1.054 2688, 42, 1, 1, 1.054 2688, 1, 42, 0, 0.958 2688, 1, 42, 1, 0.958 2688, 2058, 1, 0, 1.052 2688, 2058, 1, 1, 1.052 2688, 2049, 10, 0, 0.954 2688, 2049, 10, 1, 0.954 2752, 0, 0, 0, 1.0 2752, 0, 0, 1, 0.992 2752, 11, 0, 0, 0.954 2752, 11, 0, 1, 0.954 2752, 43, 0, 0, 0.979 2752, 43, 0, 1, 0.979 2752, 0, 11, 0, 0.939 2752, 0, 11, 1, 0.939 2752, 0, 43, 0, 0.931 2752, 0, 43, 1, 0.932 2752, 11, 11, 0, 0.949 2752, 11, 11, 1, 0.949 2752, 43, 43, 0, 1.007 2752, 43, 43, 1, 1.007 2752, 2048, 0, 0, 0.993 2752, 2048, 0, 1, 0.993 2752, 2059, 0, 0, 0.954 2752, 2059, 0, 1, 0.954 2752, 2048, 11, 0, 0.77 2752, 2048, 11, 1, 0.77 2752, 2059, 11, 0, 0.916 2752, 2059, 11, 1, 0.916 2752, 11, 1, 0, 0.994 2752, 11, 1, 1, 0.994 2752, 1, 11, 0, 0.928 2752, 1, 11, 1, 0.928 2752, 43, 1, 0, 1.022 2752, 43, 1, 1, 1.022 2752, 1, 43, 0, 0.92 2752, 1, 43, 1, 0.92 2752, 2059, 1, 0, 0.989 2752, 2059, 1, 1, 0.989 2752, 2049, 11, 0, 0.923 2752, 2049, 11, 1, 0.923 2816, 0, 0, 0, 1.003 2816, 0, 0, 1, 1.003 2816, 12, 0, 0, 0.897 2816, 12, 0, 1, 0.894 2816, 44, 0, 0, 0.914 2816, 44, 0, 1, 0.914 2816, 0, 12, 0, 0.876 2816, 0, 12, 1, 0.874 2816, 0, 44, 0, 0.871 2816, 0, 44, 1, 0.87 2816, 12, 12, 0, 0.948 2816, 12, 12, 1, 0.948 2816, 44, 44, 0, 1.009 2816, 44, 44, 1, 1.009 2816, 2048, 0, 0, 1.005 2816, 2048, 0, 1, 1.005 2816, 2060, 0, 0, 0.894 2816, 2060, 0, 1, 0.894 2816, 2048, 12, 0, 0.714 2816, 2048, 12, 1, 0.713 2816, 2060, 12, 0, 0.915 2816, 2060, 12, 1, 0.915 2816, 12, 1, 0, 0.917 2816, 12, 1, 1, 0.917 2816, 1, 12, 0, 0.858 2816, 1, 12, 1, 0.857 2816, 44, 1, 0, 0.944 2816, 44, 1, 1, 0.943 2816, 1, 44, 0, 0.856 2816, 1, 44, 1, 0.856 2816, 2060, 1, 0, 0.914 2816, 2060, 1, 1, 0.914 2816, 2049, 12, 0, 0.855 2816, 2049, 12, 1, 0.855 2880, 0, 0, 0, 0.989 2880, 0, 0, 1, 0.989 2880, 13, 0, 0, 0.967 2880, 13, 0, 1, 0.967 2880, 45, 0, 0, 0.987 2880, 45, 0, 1, 0.987 2880, 0, 13, 0, 0.925 2880, 0, 13, 1, 0.925 2880, 0, 45, 0, 0.927 2880, 0, 45, 1, 0.927 2880, 13, 13, 0, 0.944 2880, 13, 13, 1, 0.944 2880, 45, 45, 0, 1.003 2880, 45, 45, 1, 1.003 2880, 2048, 0, 0, 0.989 2880, 2048, 0, 1, 0.989 2880, 2061, 0, 0, 0.967 2880, 2061, 0, 1, 0.967 2880, 2048, 13, 0, 0.76 2880, 2048, 13, 1, 0.76 2880, 2061, 13, 0, 0.91 2880, 2061, 13, 1, 0.91 2880, 13, 1, 0, 0.922 2880, 13, 1, 1, 0.922 2880, 1, 13, 0, 0.859 2880, 1, 13, 1, 0.859 2880, 45, 1, 0, 1.013 2880, 45, 1, 1, 1.013 2880, 1, 45, 0, 0.92 2880, 1, 45, 1, 0.92 2880, 2061, 1, 0, 0.984 2880, 2061, 1, 1, 0.984 2880, 2049, 13, 0, 0.918 2880, 2049, 13, 1, 0.918 2944, 0, 0, 0, 1.014 2944, 0, 0, 1, 1.014 2944, 14, 0, 0, 0.956 2944, 14, 0, 1, 0.955 2944, 46, 0, 0, 0.979 2944, 46, 0, 1, 0.979 2944, 0, 14, 0, 0.937 2944, 0, 14, 1, 0.937 2944, 0, 46, 0, 0.93 2944, 0, 46, 1, 0.93 2944, 14, 14, 0, 0.953 2944, 14, 14, 1, 0.953 2944, 46, 46, 0, 1.009 2944, 46, 46, 1, 1.009 2944, 2048, 0, 0, 1.015 2944, 2048, 0, 1, 1.015 2944, 2062, 0, 0, 0.955 2944, 2062, 0, 1, 0.955 2944, 2048, 14, 0, 
0.769 2944, 2048, 14, 1, 0.769 2944, 2062, 14, 0, 0.923 2944, 2062, 14, 1, 0.923 2944, 14, 1, 0, 0.994 2944, 14, 1, 1, 0.994 2944, 1, 14, 0, 0.927 2944, 1, 14, 1, 0.927 2944, 46, 1, 0, 1.021 2944, 46, 1, 1, 1.021 2944, 1, 46, 0, 0.923 2944, 1, 46, 1, 0.923 2944, 2062, 1, 0, 0.988 2944, 2062, 1, 1, 0.988 2944, 2049, 14, 0, 0.922 2944, 2049, 14, 1, 0.922 3008, 0, 0, 0, 0.994 3008, 0, 0, 1, 0.994 3008, 15, 0, 0, 0.941 3008, 15, 0, 1, 0.941 3008, 47, 0, 0, 0.996 3008, 47, 0, 1, 0.996 3008, 0, 15, 0, 0.929 3008, 0, 15, 1, 0.933 3008, 0, 47, 0, 0.933 3008, 0, 47, 1, 0.933 3008, 15, 15, 0, 0.952 3008, 15, 15, 1, 0.949 3008, 47, 47, 0, 1.003 3008, 47, 47, 1, 1.003 3008, 2048, 0, 0, 0.998 3008, 2048, 0, 1, 0.998 3008, 2063, 0, 0, 0.941 3008, 2063, 0, 1, 0.941 3008, 2048, 15, 0, 0.766 3008, 2048, 15, 1, 0.766 3008, 2063, 15, 0, 0.916 3008, 2063, 15, 1, 0.916 3008, 15, 1, 0, 0.985 3008, 15, 1, 1, 0.985 3008, 1, 15, 0, 0.916 3008, 1, 15, 1, 0.916 3008, 47, 1, 0, 1.014 3008, 47, 1, 1, 1.014 3008, 1, 47, 0, 0.902 3008, 1, 47, 1, 0.902 3008, 2063, 1, 0, 0.981 3008, 2063, 1, 1, 0.981 3008, 2049, 15, 0, 0.912 3008, 2049, 15, 1, 0.913 3072, 0, 0, 0, 1.016 3072, 0, 0, 1, 1.015 3072, 16, 0, 0, 1.045 3072, 16, 0, 1, 1.045 3072, 48, 0, 0, 1.045 3072, 48, 0, 1, 1.045 3072, 0, 16, 0, 1.049 3072, 0, 16, 1, 1.049 3072, 0, 48, 0, 1.049 3072, 0, 48, 1, 1.049 3072, 16, 16, 0, 1.016 3072, 16, 16, 1, 1.016 3072, 48, 48, 0, 1.016 3072, 48, 48, 1, 1.016 3072, 2048, 0, 0, 1.016 3072, 2048, 0, 1, 1.016 3072, 2064, 0, 0, 1.045 3072, 2064, 0, 1, 1.045 3072, 2048, 16, 0, 1.049 3072, 2048, 16, 1, 1.049 3072, 2064, 16, 0, 1.016 3072, 2064, 16, 1, 1.016 3072, 16, 1, 0, 0.815 3072, 16, 1, 1, 0.815 3072, 1, 16, 0, 0.872 3072, 1, 16, 1, 0.872 3072, 48, 1, 0, 1.017 3072, 48, 1, 1, 1.017 3072, 1, 48, 0, 0.872 3072, 1, 48, 1, 0.872 3072, 2064, 1, 0, 0.815 3072, 2064, 1, 1, 0.815 3072, 2049, 16, 0, 0.872 3072, 2049, 16, 1, 0.872 3136, 0, 0, 0, 0.995 3136, 0, 0, 1, 0.995 3136, 17, 0, 0, 0.949 3136, 17, 0, 1, 0.949 3136, 49, 0, 0, 0.987 3136, 49, 0, 1, 0.987 3136, 0, 17, 0, 0.919 3136, 0, 17, 1, 0.917 3136, 0, 49, 0, 0.931 3136, 0, 49, 1, 0.931 3136, 17, 17, 0, 1.122 3136, 17, 17, 1, 1.119 3136, 49, 49, 0, 0.987 3136, 49, 49, 1, 0.987 3136, 2048, 0, 0, 0.997 3136, 2048, 0, 1, 0.997 3136, 2065, 0, 0, 0.949 3136, 2065, 0, 1, 0.949 3136, 2048, 17, 0, 0.896 3136, 2048, 17, 1, 0.896 3136, 2065, 17, 0, 1.122 3136, 2065, 17, 1, 1.119 3136, 17, 1, 0, 1.184 3136, 17, 1, 1, 1.184 3136, 1, 17, 0, 1.124 3136, 1, 17, 1, 1.125 3136, 49, 1, 0, 1.11 3136, 49, 1, 1, 1.108 3136, 1, 49, 0, 1.044 3136, 1, 49, 1, 1.044 3136, 2065, 1, 0, 1.147 3136, 2065, 1, 1, 1.147 3136, 2049, 17, 0, 1.102 3136, 2049, 17, 1, 1.1 3200, 0, 0, 0, 1.006 3200, 0, 0, 1, 1.006 3200, 18, 0, 0, 0.978 3200, 18, 0, 1, 0.978 3200, 50, 0, 0, 0.998 3200, 50, 0, 1, 0.998 3200, 0, 18, 0, 0.932 3200, 0, 18, 1, 0.932 3200, 0, 50, 0, 0.93 3200, 0, 50, 1, 0.93 3200, 18, 18, 0, 1.11 3200, 18, 18, 1, 1.11 3200, 50, 50, 0, 0.994 3200, 50, 50, 1, 0.994 3200, 2048, 0, 0, 1.007 3200, 2048, 0, 1, 1.007 3200, 2066, 0, 0, 0.978 3200, 2066, 0, 1, 0.978 3200, 2048, 18, 0, 0.894 3200, 2048, 18, 1, 0.894 3200, 2066, 18, 0, 1.11 3200, 2066, 18, 1, 1.11 3200, 18, 1, 0, 1.002 3200, 18, 1, 1, 1.002 3200, 1, 18, 0, 0.917 3200, 1, 18, 1, 0.917 3200, 50, 1, 0, 0.963 3200, 50, 1, 1, 0.964 3200, 1, 50, 0, 0.888 3200, 1, 50, 1, 0.888 3200, 2066, 1, 0, 1.002 3200, 2066, 1, 1, 1.002 3200, 2049, 18, 0, 0.914 3200, 2049, 18, 1, 0.914 3264, 0, 0, 0, 0.994 3264, 0, 0, 1, 0.994 3264, 19, 0, 0, 0.959 3264, 19, 0, 1, 0.959 
3264, 51, 0, 0, 0.994 3264, 51, 0, 1, 0.994 3264, 0, 19, 0, 0.927 3264, 0, 19, 1, 0.927 3264, 0, 51, 0, 0.927 3264, 0, 51, 1, 0.927 3264, 19, 19, 0, 1.1 3264, 19, 19, 1, 1.1 3264, 51, 51, 0, 0.982 3264, 51, 51, 1, 0.982 3264, 2048, 0, 0, 0.994 3264, 2048, 0, 1, 0.994 3264, 2067, 0, 0, 0.959 3264, 2067, 0, 1, 0.959 3264, 2048, 19, 0, 0.891 3264, 2048, 19, 1, 0.891 3264, 2067, 19, 0, 1.099 3264, 2067, 19, 1, 1.099 3264, 19, 1, 0, 0.977 3264, 19, 1, 1, 0.976 3264, 1, 19, 0, 0.921 3264, 1, 19, 1, 0.921 3264, 51, 1, 0, 0.959 3264, 51, 1, 1, 0.959 3264, 1, 51, 0, 0.886 3264, 1, 51, 1, 0.886 3264, 2067, 1, 0, 0.976 3264, 2067, 1, 1, 0.976 3264, 2049, 19, 0, 0.917 3264, 2049, 19, 1, 0.917 3328, 0, 0, 0, 0.996 3328, 0, 0, 1, 0.992 3328, 20, 0, 0, 0.955 3328, 20, 0, 1, 0.955 3328, 52, 0, 0, 0.99 3328, 52, 0, 1, 0.99 3328, 0, 20, 0, 0.926 3328, 0, 20, 1, 0.923 3328, 0, 52, 0, 0.933 3328, 0, 52, 1, 0.933 3328, 20, 20, 0, 1.11 3328, 20, 20, 1, 1.11 3328, 52, 52, 0, 0.988 3328, 52, 52, 1, 0.988 3328, 2048, 0, 0, 0.993 3328, 2048, 0, 1, 0.993 3328, 2068, 0, 0, 0.955 3328, 2068, 0, 1, 0.955 3328, 2048, 20, 0, 0.9 3328, 2048, 20, 1, 0.9 3328, 2068, 20, 0, 1.109 3328, 2068, 20, 1, 1.109 3328, 20, 1, 0, 0.99 3328, 20, 1, 1, 0.99 3328, 1, 20, 0, 0.922 3328, 1, 20, 1, 0.922 3328, 52, 1, 0, 0.972 3328, 52, 1, 1, 0.972 3328, 1, 52, 0, 0.901 3328, 1, 52, 1, 0.901 3328, 2068, 1, 0, 0.99 3328, 2068, 1, 1, 0.99 3328, 2049, 20, 0, 0.918 3328, 2049, 20, 1, 0.918 3392, 0, 0, 0, 0.998 3392, 0, 0, 1, 1.0 3392, 21, 0, 0, 0.964 3392, 21, 0, 1, 0.964 3392, 53, 0, 0, 0.998 3392, 53, 0, 1, 0.998 3392, 0, 21, 0, 0.932 3392, 0, 21, 1, 0.932 3392, 0, 53, 0, 0.93 3392, 0, 53, 1, 0.93 3392, 21, 21, 0, 1.113 3392, 21, 21, 1, 1.113 3392, 53, 53, 0, 0.983 3392, 53, 53, 1, 0.983 3392, 2048, 0, 0, 1.0 3392, 2048, 0, 1, 1.0 3392, 2069, 0, 0, 0.964 3392, 2069, 0, 1, 0.964 3392, 2048, 21, 0, 0.895 3392, 2048, 21, 1, 0.896 3392, 2069, 21, 0, 1.113 3392, 2069, 21, 1, 1.113 3392, 21, 1, 0, 0.994 3392, 21, 1, 1, 0.994 3392, 1, 21, 0, 0.923 3392, 1, 21, 1, 0.923 3392, 53, 1, 0, 0.972 3392, 53, 1, 1, 0.972 3392, 1, 53, 0, 0.891 3392, 1, 53, 1, 0.891 3392, 2069, 1, 0, 0.994 3392, 2069, 1, 1, 0.994 3392, 2049, 21, 0, 0.922 3392, 2049, 21, 1, 0.922 3456, 0, 0, 0, 0.995 3456, 0, 0, 1, 0.995 3456, 22, 0, 0, 0.965 3456, 22, 0, 1, 0.965 3456, 54, 0, 0, 0.996 3456, 54, 0, 1, 0.996 3456, 0, 22, 0, 0.927 3456, 0, 22, 1, 0.927 3456, 0, 54, 0, 0.927 3456, 0, 54, 1, 0.927 3456, 22, 22, 0, 1.107 3456, 22, 22, 1, 1.107 3456, 54, 54, 0, 0.98 3456, 54, 54, 1, 0.98 3456, 2048, 0, 0, 0.995 3456, 2048, 0, 1, 0.995 3456, 2070, 0, 0, 0.965 3456, 2070, 0, 1, 0.965 3456, 2048, 22, 0, 0.893 3456, 2048, 22, 1, 0.893 3456, 2070, 22, 0, 1.107 3456, 2070, 22, 1, 1.107 3456, 22, 1, 0, 0.988 3456, 22, 1, 1, 0.988 3456, 1, 22, 0, 0.921 3456, 1, 22, 1, 0.921 3456, 54, 1, 0, 0.963 3456, 54, 1, 1, 0.963 3456, 1, 54, 0, 0.887 3456, 1, 54, 1, 0.887 3456, 2070, 1, 0, 0.988 3456, 2070, 1, 1, 0.988 3456, 2049, 22, 0, 0.917 3456, 2049, 22, 1, 0.917 3520, 0, 0, 0, 1.016 3520, 0, 0, 1, 1.016 3520, 23, 0, 0, 0.957 3520, 23, 0, 1, 0.957 3520, 55, 0, 0, 0.991 3520, 55, 0, 1, 0.991 3520, 0, 23, 0, 0.919 3520, 0, 23, 1, 0.924 3520, 0, 55, 0, 0.934 3520, 0, 55, 1, 0.934 3520, 23, 23, 0, 1.111 3520, 23, 23, 1, 1.111 3520, 55, 55, 0, 0.994 3520, 55, 55, 1, 0.994 3520, 2048, 0, 0, 1.016 3520, 2048, 0, 1, 1.016 3520, 2071, 0, 0, 0.957 3520, 2071, 0, 1, 0.957 3520, 2048, 23, 0, 0.903 3520, 2048, 23, 1, 0.903 3520, 2071, 23, 0, 1.111 3520, 2071, 23, 1, 1.111 3520, 23, 1, 0, 0.997 3520, 23, 1, 1, 
0.997 3520, 1, 23, 0, 0.921 3520, 1, 23, 1, 0.921 3520, 55, 1, 0, 0.976 3520, 55, 1, 1, 0.976 3520, 1, 55, 0, 0.902 3520, 1, 55, 1, 0.902 3520, 2071, 1, 0, 0.997 3520, 2071, 1, 1, 0.997 3520, 2049, 23, 0, 0.918 3520, 2049, 23, 1, 0.918 3584, 0, 0, 0, 1.004 3584, 0, 0, 1, 1.004 3584, 24, 0, 0, 0.985 3584, 24, 0, 1, 0.979 3584, 56, 0, 0, 1.006 3584, 56, 0, 1, 1.006 3584, 0, 24, 0, 0.931 3584, 0, 24, 1, 0.931 3584, 0, 56, 0, 0.93 3584, 0, 56, 1, 0.93 3584, 24, 24, 0, 1.111 3584, 24, 24, 1, 1.11 3584, 56, 56, 0, 1.101 3584, 56, 56, 1, 1.1 3584, 2048, 0, 0, 1.005 3584, 2048, 0, 1, 1.005 3584, 2072, 0, 0, 0.98 3584, 2072, 0, 1, 0.978 3584, 2048, 24, 0, 0.896 3584, 2048, 24, 1, 0.897 3584, 2072, 24, 0, 1.111 3584, 2072, 24, 1, 1.111 3584, 24, 1, 0, 1.004 3584, 24, 1, 1, 1.004 3584, 1, 24, 0, 0.921 3584, 1, 24, 1, 0.921 3584, 56, 1, 0, 0.971 3584, 56, 1, 1, 0.97 3584, 1, 56, 0, 0.89 3584, 1, 56, 1, 0.89 3584, 2072, 1, 0, 1.004 3584, 2072, 1, 1, 1.004 3584, 2049, 24, 0, 0.918 3584, 2049, 24, 1, 0.918 3648, 0, 0, 0, 1.012 3648, 0, 0, 1, 1.012 3648, 25, 0, 0, 0.96 3648, 25, 0, 1, 0.96 3648, 57, 0, 0, 0.988 3648, 57, 0, 1, 0.988 3648, 0, 25, 0, 0.927 3648, 0, 25, 1, 0.927 3648, 0, 57, 0, 0.927 3648, 0, 57, 1, 0.927 3648, 25, 25, 0, 1.101 3648, 25, 25, 1, 1.101 3648, 57, 57, 0, 0.986 3648, 57, 57, 1, 0.986 3648, 2048, 0, 0, 1.012 3648, 2048, 0, 1, 1.012 3648, 2073, 0, 0, 0.96 3648, 2073, 0, 1, 0.959 3648, 2048, 25, 0, 0.894 3648, 2048, 25, 1, 0.895 3648, 2073, 25, 0, 1.103 3648, 2073, 25, 1, 1.103 3648, 25, 1, 0, 1.024 3648, 25, 1, 1, 1.024 3648, 1, 25, 0, 0.911 3648, 1, 25, 1, 0.912 3648, 57, 1, 0, 0.973 3648, 57, 1, 1, 0.974 3648, 1, 57, 0, 0.888 3648, 1, 57, 1, 0.888 3648, 2073, 1, 0, 1.024 3648, 2073, 1, 1, 1.024 3648, 2049, 25, 0, 0.907 3648, 2049, 25, 1, 0.907 3712, 0, 0, 0, 0.996 3712, 0, 0, 1, 0.996 3712, 26, 0, 0, 0.96 3712, 26, 0, 1, 0.96 3712, 58, 0, 0, 0.995 3712, 58, 0, 1, 0.995 3712, 0, 26, 0, 0.919 3712, 0, 26, 1, 0.918 3712, 0, 58, 0, 0.93 3712, 0, 58, 1, 0.93 3712, 26, 26, 0, 1.103 3712, 26, 26, 1, 1.102 3712, 58, 58, 0, 0.989 3712, 58, 58, 1, 0.989 3712, 2048, 0, 0, 0.997 3712, 2048, 0, 1, 0.997 3712, 2074, 0, 0, 0.959 3712, 2074, 0, 1, 0.959 3712, 2048, 26, 0, 0.901 3712, 2048, 26, 1, 0.901 3712, 2074, 26, 0, 1.104 3712, 2074, 26, 1, 1.102 3712, 26, 1, 0, 1.001 3712, 26, 1, 1, 1.001 3712, 1, 26, 0, 0.922 3712, 1, 26, 1, 0.922 3712, 58, 1, 0, 0.974 3712, 58, 1, 1, 0.974 3712, 1, 58, 0, 0.903 3712, 1, 58, 1, 0.903 3712, 2074, 1, 0, 1.001 3712, 2074, 1, 1, 1.001 3712, 2049, 26, 0, 0.919 3712, 2049, 26, 1, 0.919 3776, 0, 0, 0, 1.003 3776, 0, 0, 1, 1.003 3776, 27, 0, 0, 0.964 3776, 27, 0, 1, 0.964 3776, 59, 0, 0, 1.004 3776, 59, 0, 1, 1.004 3776, 0, 27, 0, 0.931 3776, 0, 27, 1, 0.931 3776, 0, 59, 0, 0.929 3776, 0, 59, 1, 0.93 3776, 27, 27, 0, 1.097 3776, 27, 27, 1, 1.097 3776, 59, 59, 0, 0.992 3776, 59, 59, 1, 0.992 3776, 2048, 0, 0, 1.003 3776, 2048, 0, 1, 1.003 3776, 2075, 0, 0, 0.963 3776, 2075, 0, 1, 0.964 3776, 2048, 27, 0, 0.898 3776, 2048, 27, 1, 0.898 3776, 2075, 27, 0, 1.097 3776, 2075, 27, 1, 1.097 3776, 27, 1, 0, 0.998 3776, 27, 1, 1, 0.998 3776, 1, 27, 0, 0.925 3776, 1, 27, 1, 0.925 3776, 59, 1, 0, 0.979 3776, 59, 1, 1, 0.979 3776, 1, 59, 0, 0.894 3776, 1, 59, 1, 0.894 3776, 2075, 1, 0, 0.998 3776, 2075, 1, 1, 0.999 3776, 2049, 27, 0, 0.923 3776, 2049, 27, 1, 0.923 3840, 0, 0, 0, 0.997 3840, 0, 0, 1, 0.997 3840, 28, 0, 0, 0.968 3840, 28, 0, 1, 0.968 3840, 60, 0, 0, 1.001 3840, 60, 0, 1, 1.001 3840, 0, 28, 0, 0.926 3840, 0, 28, 1, 0.927 3840, 0, 60, 0, 0.927 3840, 0, 60, 1, 
0.927 3840, 28, 28, 0, 1.094 3840, 28, 28, 1, 1.094 3840, 60, 60, 0, 0.982 3840, 60, 60, 1, 0.982 3840, 2048, 0, 0, 0.998 3840, 2048, 0, 1, 0.998 3840, 2076, 0, 0, 0.968 3840, 2076, 0, 1, 0.968 3840, 2048, 28, 0, 0.896 3840, 2048, 28, 1, 0.896 3840, 2076, 28, 0, 1.094 3840, 2076, 28, 1, 1.094 3840, 28, 1, 0, 0.983 3840, 28, 1, 1, 0.982 3840, 1, 28, 0, 0.916 3840, 1, 28, 1, 0.916 3840, 60, 1, 0, 0.969 3840, 60, 1, 1, 0.969 3840, 1, 60, 0, 0.891 3840, 1, 60, 1, 0.891 3840, 2076, 1, 0, 0.983 3840, 2076, 1, 1, 0.983 3840, 2049, 28, 0, 0.912 3840, 2049, 28, 1, 0.912 3904, 0, 0, 0, 1.002 3904, 0, 0, 1, 1.0 3904, 29, 0, 0, 0.961 3904, 29, 0, 1, 0.961 3904, 61, 0, 0, 0.997 3904, 61, 0, 1, 0.997 3904, 0, 29, 0, 0.915 3904, 0, 29, 1, 0.922 3904, 0, 61, 0, 0.933 3904, 0, 61, 1, 0.933 3904, 29, 29, 0, 1.103 3904, 29, 29, 1, 1.103 3904, 61, 61, 0, 0.995 3904, 61, 61, 1, 0.995 3904, 2048, 0, 0, 0.998 3904, 2048, 0, 1, 1.0 3904, 2077, 0, 0, 0.961 3904, 2077, 0, 1, 0.961 3904, 2048, 29, 0, 0.904 3904, 2048, 29, 1, 0.904 3904, 2077, 29, 0, 1.103 3904, 2077, 29, 1, 1.103 3904, 29, 1, 0, 1.0 3904, 29, 1, 1, 1.0 3904, 1, 29, 0, 0.922 3904, 1, 29, 1, 0.922 3904, 61, 1, 0, 0.98 3904, 61, 1, 1, 0.98 3904, 1, 61, 0, 0.904 3904, 1, 61, 1, 0.904 3904, 2077, 1, 0, 1.0 3904, 2077, 1, 1, 1.0 3904, 2049, 29, 0, 0.919 3904, 2049, 29, 1, 0.919 3968, 0, 0, 0, 1.003 3968, 0, 0, 1, 1.003 3968, 30, 0, 0, 0.969 3968, 30, 0, 1, 0.969 3968, 62, 0, 0, 1.006 3968, 62, 0, 1, 1.006 3968, 0, 30, 0, 0.931 3968, 0, 30, 1, 0.93 3968, 0, 62, 0, 0.929 3968, 0, 62, 1, 0.929 3968, 30, 30, 0, 1.103 3968, 30, 30, 1, 1.103 3968, 62, 62, 0, 0.99 3968, 62, 62, 1, 0.99 3968, 2048, 0, 0, 1.004 3968, 2048, 0, 1, 1.004 3968, 2078, 0, 0, 0.969 3968, 2078, 0, 1, 0.969 3968, 2048, 30, 0, 0.899 3968, 2048, 30, 1, 0.899 3968, 2078, 30, 0, 1.105 3968, 2078, 30, 1, 1.105 3968, 30, 1, 0, 0.993 3968, 30, 1, 1, 0.993 3968, 1, 30, 0, 0.908 3968, 1, 30, 1, 0.908 3968, 62, 1, 0, 0.978 3968, 62, 1, 1, 0.978 3968, 1, 62, 0, 0.895 3968, 1, 62, 1, 0.895 3968, 2078, 1, 0, 0.993 3968, 2078, 1, 1, 0.993 3968, 2049, 30, 0, 0.904 3968, 2049, 30, 1, 0.904 4032, 0, 0, 0, 0.995 4032, 0, 0, 1, 0.995 4032, 31, 0, 0, 0.967 4032, 31, 0, 1, 0.967 4032, 63, 0, 0, 1.002 4032, 63, 0, 1, 1.002 4032, 0, 31, 0, 0.927 4032, 0, 31, 1, 0.926 4032, 0, 63, 0, 0.927 4032, 0, 63, 1, 0.927 4032, 31, 31, 0, 1.09 4032, 31, 31, 1, 1.09 4032, 63, 63, 0, 0.987 4032, 63, 63, 1, 0.987 4032, 2048, 0, 0, 0.995 4032, 2048, 0, 1, 0.995 4032, 2079, 0, 0, 0.967 4032, 2079, 0, 1, 0.967 4032, 2048, 31, 0, 0.897 4032, 2048, 31, 1, 0.897 4032, 2079, 31, 0, 1.09 4032, 2079, 31, 1, 1.09 4032, 31, 1, 0, 0.989 4032, 31, 1, 1, 0.989 4032, 1, 31, 0, 0.911 4032, 1, 31, 1, 0.911 4032, 63, 1, 0, 0.971 4032, 63, 1, 1, 0.972 4032, 1, 63, 0, 0.892 4032, 1, 63, 1, 0.892 4032, 2079, 1, 0, 0.989 4032, 2079, 1, 1, 0.989 4032, 2049, 31, 0, 0.907 4032, 2049, 31, 1, 0.907 4096, 32, 0, 0, 1.014 4096, 32, 0, 1, 1.014 4096, 64, 0, 0, 1.014 4096, 64, 0, 1, 1.014 4096, 0, 32, 0, 1.012 4096, 0, 32, 1, 1.012 4096, 0, 64, 0, 1.012 4096, 0, 64, 1, 1.012 4096, 32, 32, 0, 1.014 4096, 32, 32, 1, 1.014 4096, 64, 64, 0, 1.014 4096, 64, 64, 1, 1.014 4096, 2080, 0, 0, 1.014 4096, 2080, 0, 1, 1.014 4096, 2048, 32, 0, 1.014 4096, 2048, 32, 1, 1.014 4096, 2080, 32, 0, 1.014 4096, 2080, 32, 1, 1.014 4096, 32, 1, 0, 0.975 4096, 32, 1, 1, 0.975 4096, 1, 32, 0, 0.769 4096, 1, 32, 1, 0.769 4096, 64, 1, 0, 0.858 4096, 64, 1, 1, 0.858 4096, 1, 64, 0, 0.769 4096, 1, 64, 1, 0.769 4096, 2080, 1, 0, 0.829 4096, 2080, 1, 1, 0.829 4096, 2049, 32, 0, 0.886 
4096, 2049, 32, 1, 0.886 4160, 0, 0, 0, 1.003 4160, 0, 0, 1, 1.003 4160, 33, 0, 0, 1.004 4160, 33, 0, 1, 1.004 4160, 65, 0, 0, 0.999 4160, 65, 0, 1, 0.999 4160, 0, 33, 0, 0.931 4160, 0, 33, 1, 0.931 4160, 0, 65, 0, 0.765 4160, 0, 65, 1, 0.765 4160, 33, 33, 0, 0.998 4160, 33, 33, 1, 0.998 4160, 65, 65, 0, 0.942 4160, 65, 65, 1, 0.942 4160, 2048, 0, 0, 1.003 4160, 2048, 0, 1, 1.003 4160, 2081, 0, 0, 1.004 4160, 2081, 0, 1, 1.004 4160, 2048, 33, 0, 0.899 4160, 2048, 33, 1, 0.898 4160, 2081, 33, 0, 1.002 4160, 2081, 33, 1, 1.002 4160, 33, 1, 0, 1.114 4160, 33, 1, 1, 1.114 4160, 1, 33, 0, 1.01 4160, 1, 33, 1, 1.01 4160, 65, 1, 0, 1.077 4160, 65, 1, 1, 1.077 4160, 1, 65, 0, 0.935 4160, 1, 65, 1, 0.935 4160, 2081, 1, 0, 1.077 4160, 2081, 1, 1, 1.077 4160, 2049, 33, 0, 1.007 4160, 2049, 33, 1, 1.007 4224, 0, 0, 0, 1.014 4224, 0, 0, 1, 1.014 4224, 34, 0, 0, 1.0 4224, 34, 0, 1, 1.0 4224, 66, 0, 0, 1.001 4224, 66, 0, 1, 1.001 4224, 0, 34, 0, 0.928 4224, 0, 34, 1, 0.928 4224, 0, 66, 0, 0.762 4224, 0, 66, 1, 0.762 4224, 34, 34, 0, 0.998 4224, 34, 34, 1, 0.998 4224, 66, 66, 0, 0.959 4224, 66, 66, 1, 0.959 4224, 2048, 0, 0, 1.014 4224, 2048, 0, 1, 1.014 4224, 2082, 0, 0, 1.001 4224, 2082, 0, 1, 1.001 4224, 2048, 34, 0, 0.899 4224, 2048, 34, 1, 0.898 4224, 2082, 34, 0, 0.998 4224, 2082, 34, 1, 0.998 4224, 34, 1, 0, 1.024 4224, 34, 1, 1, 1.023 4224, 1, 34, 0, 0.917 4224, 1, 34, 1, 0.917 4224, 66, 1, 0, 1.012 4224, 66, 1, 1, 1.013 4224, 1, 66, 0, 0.917 4224, 1, 66, 1, 0.917 4224, 2082, 1, 0, 1.022 4224, 2082, 1, 1, 1.022 4224, 2049, 34, 0, 0.914 4224, 2049, 34, 1, 0.914 4288, 0, 0, 0, 0.999 4288, 0, 0, 1, 0.999 4288, 35, 0, 0, 0.995 4288, 35, 0, 1, 0.996 4288, 67, 0, 0, 0.998 4288, 67, 0, 1, 0.998 4288, 0, 35, 0, 0.919 4288, 0, 35, 1, 0.918 4288, 0, 67, 0, 0.767 4288, 0, 67, 1, 0.767 4288, 35, 35, 0, 1.005 4288, 35, 35, 1, 1.004 4288, 67, 67, 0, 0.995 4288, 67, 67, 1, 0.995 4288, 2048, 0, 0, 0.999 4288, 2048, 0, 1, 0.999 4288, 2083, 0, 0, 0.995 4288, 2083, 0, 1, 0.995 4288, 2048, 35, 0, 0.905 4288, 2048, 35, 1, 0.904 4288, 2083, 35, 0, 1.005 4288, 2083, 35, 1, 1.004 4288, 35, 1, 0, 1.033 4288, 35, 1, 1, 1.032 4288, 1, 35, 0, 0.928 4288, 1, 35, 1, 0.928 4288, 67, 1, 0, 1.019 4288, 67, 1, 1, 1.02 4288, 1, 67, 0, 0.925 4288, 1, 67, 1, 0.924 4288, 2083, 1, 0, 1.03 4288, 2083, 1, 1, 1.03 4288, 2049, 35, 0, 0.925 4288, 2049, 35, 1, 0.926 4352, 0, 0, 0, 1.005 4352, 0, 0, 1, 1.005 4352, 36, 0, 0, 1.007 4352, 36, 0, 1, 1.006 4352, 68, 0, 0, 1.007 4352, 68, 0, 1, 1.008 4352, 0, 36, 0, 0.929 4352, 0, 36, 1, 0.929 4352, 0, 68, 0, 0.766 4352, 0, 68, 1, 0.766 4352, 36, 36, 0, 0.998 4352, 36, 36, 1, 0.998 4352, 68, 68, 0, 0.964 4352, 68, 68, 1, 0.964 4352, 2048, 0, 0, 1.006 4352, 2048, 0, 1, 1.006 4352, 2084, 0, 0, 1.006 4352, 2084, 0, 1, 1.006 4352, 2048, 36, 0, 0.897 4352, 2048, 36, 1, 0.898 4352, 2084, 36, 0, 0.998 4352, 2084, 36, 1, 0.998 4352, 36, 1, 0, 1.031 4352, 36, 1, 1, 1.031 4352, 1, 36, 0, 0.924 4352, 1, 36, 1, 0.924 4352, 68, 1, 0, 0.999 4352, 68, 1, 1, 0.999 4352, 1, 68, 0, 0.922 4352, 1, 68, 1, 0.922 4352, 2084, 1, 0, 1.03 4352, 2084, 1, 1, 1.03 4352, 2049, 36, 0, 0.922 4352, 2049, 36, 1, 0.922 4416, 0, 0, 0, 0.997 4416, 0, 0, 1, 0.997 4416, 37, 0, 0, 1.002 4416, 37, 0, 1, 1.002 4416, 69, 0, 0, 1.004 4416, 69, 0, 1, 1.004 4416, 0, 37, 0, 0.928 4416, 0, 37, 1, 0.927 4416, 0, 69, 0, 0.762 4416, 0, 69, 1, 0.762 4416, 37, 37, 0, 0.994 4416, 37, 37, 1, 0.994 4416, 69, 69, 0, 0.959 4416, 69, 69, 1, 0.959 4416, 2048, 0, 0, 0.997 4416, 2048, 0, 1, 0.997 4416, 2085, 0, 0, 1.001 4416, 2085, 0, 1, 1.001 4416, 2048, 
37, 0, 0.899 4416, 2048, 37, 1, 0.899 4416, 2085, 37, 0, 0.994 4416, 2085, 37, 1, 0.994 4416, 37, 1, 0, 1.024 4416, 37, 1, 1, 1.023 4416, 1, 37, 0, 0.923 4416, 1, 37, 1, 0.922 4416, 69, 1, 0, 1.009 4416, 69, 1, 1, 1.01 4416, 1, 69, 0, 0.917 4416, 1, 69, 1, 0.917 4416, 2085, 1, 0, 1.024 4416, 2085, 1, 1, 1.024 4416, 2049, 37, 0, 0.919 4416, 2049, 37, 1, 0.919 4480, 0, 0, 0, 1.0 4480, 0, 0, 1, 0.999 4480, 38, 0, 0, 0.996 4480, 38, 0, 1, 0.996 4480, 70, 0, 0, 1.0 4480, 70, 0, 1, 1.0 4480, 0, 38, 0, 0.919 4480, 0, 38, 1, 0.921 4480, 0, 70, 0, 0.767 4480, 0, 70, 1, 0.767 4480, 38, 38, 0, 1.002 4480, 38, 38, 1, 1.002 4480, 70, 70, 0, 0.963 4480, 70, 70, 1, 0.963 4480, 2048, 0, 0, 0.998 4480, 2048, 0, 1, 0.999 4480, 2086, 0, 0, 0.996 4480, 2086, 0, 1, 0.995 4480, 2048, 38, 0, 0.907 4480, 2048, 38, 1, 0.907 4480, 2086, 38, 0, 1.002 4480, 2086, 38, 1, 1.002 4480, 38, 1, 0, 1.032 4480, 38, 1, 1, 1.031 4480, 1, 38, 0, 0.919 4480, 1, 38, 1, 0.92 4480, 70, 1, 0, 1.018 4480, 70, 1, 1, 1.017 4480, 1, 70, 0, 0.916 4480, 1, 70, 1, 0.915 4480, 2086, 1, 0, 1.031 4480, 2086, 1, 1, 1.03 4480, 2049, 38, 0, 0.917 4480, 2049, 38, 1, 0.918 4544, 0, 0, 0, 1.002 4544, 0, 0, 1, 1.002 4544, 39, 0, 0, 1.007 4544, 39, 0, 1, 1.008 4544, 71, 0, 0, 1.002 4544, 71, 0, 1, 1.002 4544, 0, 39, 0, 0.93 4544, 0, 39, 1, 0.931 4544, 0, 71, 0, 0.766 4544, 0, 71, 1, 0.766 4544, 39, 39, 0, 1.001 4544, 39, 39, 1, 1.001 4544, 71, 71, 0, 0.966 4544, 71, 71, 1, 0.966 4544, 2048, 0, 0, 1.002 4544, 2048, 0, 1, 1.002 4544, 2087, 0, 0, 1.008 4544, 2087, 0, 1, 1.007 4544, 2048, 39, 0, 0.901 4544, 2048, 39, 1, 0.901 4544, 2087, 39, 0, 1.001 4544, 2087, 39, 1, 1.001 4544, 39, 1, 0, 1.025 4544, 39, 1, 1, 1.025 4544, 1, 39, 0, 0.919 4544, 1, 39, 1, 0.919 4544, 71, 1, 0, 0.991 4544, 71, 1, 1, 0.991 4544, 1, 71, 0, 0.921 4544, 1, 71, 1, 0.922 4544, 2087, 1, 0, 1.025 4544, 2087, 1, 1, 1.025 4544, 2049, 39, 0, 0.917 4544, 2049, 39, 1, 0.917 4608, 0, 0, 0, 0.997 4608, 0, 0, 1, 0.997 4608, 40, 0, 0, 1.013 4608, 40, 0, 1, 1.013 4608, 72, 0, 0, 1.013 4608, 72, 0, 1, 1.013 4608, 0, 40, 0, 0.925 4608, 0, 40, 1, 0.926 4608, 0, 72, 0, 0.765 4608, 0, 72, 1, 0.765 4608, 40, 40, 0, 1.084 4608, 40, 40, 1, 1.084 4608, 72, 72, 0, 0.966 4608, 72, 72, 1, 0.966 4608, 2048, 0, 0, 0.999 4608, 2048, 0, 1, 0.999 4608, 2088, 0, 0, 1.012 4608, 2088, 0, 1, 1.012 4608, 2048, 40, 0, 0.898 4608, 2048, 40, 1, 0.898 4608, 2088, 40, 0, 1.087 4608, 2088, 40, 1, 1.087 4608, 40, 1, 0, 1.006 4608, 40, 1, 1, 1.006 4608, 1, 40, 0, 0.926 4608, 1, 40, 1, 0.925 4608, 72, 1, 0, 1.012 4608, 72, 1, 1, 1.011 4608, 1, 72, 0, 0.92 4608, 1, 72, 1, 0.92 4608, 2088, 1, 0, 1.006 4608, 2088, 1, 1, 1.006 4608, 2049, 40, 0, 0.923 4608, 2049, 40, 1, 0.923 4672, 0, 0, 0, 1.014 4672, 0, 0, 1, 1.014 4672, 41, 0, 0, 1.003 4672, 41, 0, 1, 1.003 4672, 73, 0, 0, 0.983 4672, 73, 0, 1, 0.982 4672, 0, 41, 0, 0.916 4672, 0, 41, 1, 0.918 4672, 0, 73, 0, 0.772 4672, 0, 73, 1, 0.772 4672, 41, 41, 0, 1.012 4672, 41, 41, 1, 1.012 4672, 73, 73, 0, 0.973 4672, 73, 73, 1, 0.973 4672, 2048, 0, 0, 1.014 4672, 2048, 0, 1, 1.014 4672, 2089, 0, 0, 1.002 4672, 2089, 0, 1, 1.002 4672, 2048, 41, 0, 0.907 4672, 2048, 41, 1, 0.908 4672, 2089, 41, 0, 1.012 4672, 2089, 41, 1, 1.012 4672, 41, 1, 0, 1.027 4672, 41, 1, 1, 1.027 4672, 1, 41, 0, 0.928 4672, 1, 41, 1, 0.927 4672, 73, 1, 0, 1.032 4672, 73, 1, 1, 1.03 4672, 1, 73, 0, 0.927 4672, 1, 73, 1, 0.927 4672, 2089, 1, 0, 1.026 4672, 2089, 1, 1, 1.027 4672, 2049, 41, 0, 0.925 4672, 2049, 41, 1, 0.925 4736, 0, 0, 0, 1.005 4736, 0, 0, 1, 1.005 4736, 42, 0, 0, 1.012 4736, 42, 0, 1, 1.012 
4736, 74, 0, 0, 0.976 4736, 74, 0, 1, 0.975 4736, 0, 42, 0, 0.93 4736, 0, 42, 1, 0.93 4736, 0, 74, 0, 0.77 4736, 0, 74, 1, 0.77 4736, 42, 42, 0, 1.007 4736, 42, 42, 1, 1.007 4736, 74, 74, 0, 0.965 4736, 74, 74, 1, 0.965 4736, 2048, 0, 0, 1.006 4736, 2048, 0, 1, 1.006 4736, 2090, 0, 0, 1.013 4736, 2090, 0, 1, 1.013 4736, 2048, 42, 0, 0.902 4736, 2048, 42, 1, 0.902 4736, 2090, 42, 0, 1.007 4736, 2090, 42, 1, 1.007 4736, 42, 1, 0, 1.032 4736, 42, 1, 1, 1.032 4736, 1, 42, 0, 0.925 4736, 1, 42, 1, 0.925 4736, 74, 1, 0, 1.018 4736, 74, 1, 1, 1.018 4736, 1, 74, 0, 0.912 4736, 1, 74, 1, 0.912 4736, 2090, 1, 0, 1.032 4736, 2090, 1, 1, 1.032 4736, 2049, 42, 0, 0.923 4736, 2049, 42, 1, 0.923 4800, 0, 0, 0, 1.012 4800, 0, 0, 1, 1.012 4800, 43, 0, 0, 1.008 4800, 43, 0, 1, 1.008 4800, 75, 0, 0, 0.99 4800, 75, 0, 1, 0.99 4800, 0, 43, 0, 0.928 4800, 0, 43, 1, 0.928 4800, 0, 75, 0, 0.767 4800, 0, 75, 1, 0.768 4800, 43, 43, 0, 1.004 4800, 43, 43, 1, 1.004 4800, 75, 75, 0, 0.965 4800, 75, 75, 1, 0.965 4800, 2048, 0, 0, 1.012 4800, 2048, 0, 1, 1.012 4800, 2091, 0, 0, 1.009 4800, 2091, 0, 1, 1.008 4800, 2048, 43, 0, 0.902 4800, 2048, 43, 1, 0.902 4800, 2091, 43, 0, 1.004 4800, 2091, 43, 1, 1.004 4800, 43, 1, 0, 1.026 4800, 43, 1, 1, 1.025 4800, 1, 43, 0, 0.91 4800, 1, 43, 1, 0.91 4800, 75, 1, 0, 0.992 4800, 75, 1, 1, 0.992 4800, 1, 75, 0, 0.921 4800, 1, 75, 1, 0.92 4800, 2091, 1, 0, 1.025 4800, 2091, 1, 1, 1.025 4800, 2049, 43, 0, 0.907 4800, 2049, 43, 1, 0.907 4864, 0, 0, 0, 0.998 4864, 0, 0, 1, 0.998 4864, 44, 0, 0, 1.003 4864, 44, 0, 1, 1.004 4864, 76, 0, 0, 0.987 4864, 76, 0, 1, 0.987 4864, 0, 44, 0, 0.92 4864, 0, 44, 1, 0.921 4864, 0, 76, 0, 0.933 4864, 0, 76, 1, 0.932 4864, 44, 44, 0, 1.006 4864, 44, 44, 1, 1.004 4864, 76, 76, 0, 0.976 4864, 76, 76, 1, 0.975 4864, 2048, 0, 0, 0.999 4864, 2048, 0, 1, 0.999 4864, 2092, 0, 0, 1.004 4864, 2092, 0, 1, 1.005 4864, 2048, 44, 0, 0.907 4864, 2048, 44, 1, 0.907 4864, 2092, 44, 0, 1.006 4864, 2092, 44, 1, 1.005 4864, 44, 1, 0, 1.034 4864, 44, 1, 1, 1.032 4864, 1, 44, 0, 0.908 4864, 1, 44, 1, 0.929 4864, 76, 1, 0, 1.006 4864, 76, 1, 1, 1.005 4864, 1, 76, 0, 0.798 4864, 1, 76, 1, 0.798 4864, 2092, 1, 0, 1.033 4864, 2092, 1, 1, 1.033 4864, 2049, 44, 0, 0.904 4864, 2049, 44, 1, 0.925 4928, 0, 0, 0, 1.005 4928, 0, 0, 1, 1.005 4928, 45, 0, 0, 0.993 4928, 45, 0, 1, 1.012 4928, 77, 0, 0, 0.956 4928, 77, 0, 1, 0.976 4928, 0, 45, 0, 0.933 4928, 0, 45, 1, 0.932 4928, 0, 77, 0, 0.771 4928, 0, 77, 1, 0.771 4928, 45, 45, 0, 1.015 4928, 45, 45, 1, 1.015 4928, 77, 77, 0, 0.972 4928, 77, 77, 1, 0.972 4928, 2048, 0, 0, 1.005 4928, 2048, 0, 1, 1.005 4928, 2093, 0, 0, 0.992 4928, 2093, 0, 1, 1.012 4928, 2048, 45, 0, 0.932 4928, 2048, 45, 1, 0.931 4928, 2093, 45, 0, 1.015 4928, 2093, 45, 1, 1.015 4928, 45, 1, 0, 1.009 4928, 45, 1, 1, 1.032 4928, 1, 45, 0, 0.806 4928, 1, 45, 1, 0.805 4928, 77, 1, 0, 0.981 4928, 77, 1, 1, 1.005 4928, 1, 77, 0, 0.917 4928, 1, 77, 1, 0.917 4928, 2093, 1, 0, 1.008 4928, 2093, 1, 1, 1.032 4928, 2049, 45, 0, 0.794 4928, 2049, 45, 1, 0.794 4992, 0, 0, 0, 0.999 4992, 0, 0, 1, 0.999 4992, 46, 0, 0, 0.985 4992, 46, 0, 1, 1.008 4992, 78, 0, 0, 0.963 4992, 78, 0, 1, 0.984 4992, 0, 46, 0, 0.908 4992, 0, 46, 1, 0.908 4992, 0, 78, 0, 0.752 4992, 0, 78, 1, 0.751 4992, 46, 46, 0, 0.997 4992, 46, 46, 1, 0.997 4992, 78, 78, 0, 0.969 4992, 78, 78, 1, 0.968 4992, 2048, 0, 0, 1.0 4992, 2048, 0, 1, 1.0 4992, 2094, 0, 0, 0.987 4992, 2094, 0, 1, 1.008 4992, 2048, 46, 0, 0.883 4992, 2048, 46, 1, 0.883 4992, 2094, 46, 0, 0.997 4992, 2094, 46, 1, 0.997 4992, 46, 1, 0, 0.998 4992, 
46, 1, 1, 1.02 4992, 1, 46, 0, 0.917 4992, 1, 46, 1, 0.917 4992, 78, 1, 0, 0.972 4992, 78, 1, 1, 0.993 4992, 1, 78, 0, 0.919 4992, 1, 78, 1, 0.92 4992, 2094, 1, 0, 0.997 4992, 2094, 1, 1, 1.019 4992, 2049, 46, 0, 0.914 4992, 2049, 46, 1, 0.914
5056, 0, 0, 0, 1.002 5056, 0, 0, 1, 1.0 5056, 47, 0, 0, 1.005 5056, 47, 0, 1, 1.005 5056, 79, 0, 0, 0.989 5056, 79, 0, 1, 0.989 5056, 0, 47, 0, 0.918 5056, 0, 47, 1, 0.919 5056, 0, 79, 0, 0.772 5056, 0, 79, 1, 0.771 5056, 47, 47, 0, 1.006 5056, 47, 47, 1, 1.006 5056, 79, 79, 0, 0.972 5056, 79, 79, 1, 0.972 5056, 2048, 0, 0, 1.001 5056, 2048, 0, 1, 1.0 5056, 2095, 0, 0, 1.004 5056, 2095, 0, 1, 1.004 5056, 2048, 47, 0, 0.908 5056, 2048, 47, 1, 0.909 5056, 2095, 47, 0, 1.006 5056, 2095, 47, 1, 1.006 5056, 47, 1, 0, 1.033 5056, 47, 1, 1, 1.033 5056, 1, 47, 0, 0.919 5056, 1, 47, 1, 0.919 5056, 79, 1, 0, 1.003 5056, 79, 1, 1, 1.005 5056, 1, 79, 0, 0.921 5056, 1, 79, 1, 0.921 5056, 2095, 1, 0, 1.032 5056, 2095, 1, 1, 1.034 5056, 2049, 47, 0, 0.918 5056, 2049, 47, 1, 0.917
5120, 0, 0, 0, 1.003 5120, 0, 0, 1, 1.003 5120, 48, 0, 0, 1.068 5120, 48, 0, 1, 1.068 5120, 80, 0, 0, 1.068 5120, 80, 0, 1, 1.068 5120, 0, 48, 0, 1.065 5120, 0, 48, 1, 1.065 5120, 0, 80, 0, 1.064 5120, 0, 80, 1, 1.065 5120, 48, 48, 0, 1.004 5120, 48, 48, 1, 1.004 5120, 80, 80, 0, 1.005 5120, 80, 80, 1, 1.005 5120, 2048, 0, 0, 1.005 5120, 2048, 0, 1, 1.005 5120, 2096, 0, 0, 1.068 5120, 2096, 0, 1, 1.068 5120, 2048, 48, 0, 1.065 5120, 2048, 48, 1, 1.065 5120, 2096, 48, 0, 1.005 5120, 2096, 48, 1, 1.005 5120, 48, 1, 0, 1.033 5120, 48, 1, 1, 1.031 5120, 1, 48, 0, 0.898 5120, 1, 48, 1, 0.898 5120, 80, 1, 0, 0.844 5120, 80, 1, 1, 0.844 5120, 1, 80, 0, 0.898 5120, 1, 80, 1, 0.898 5120, 2096, 1, 0, 0.856 5120, 2096, 1, 1, 0.855 5120, 2049, 48, 0, 0.898 5120, 2049, 48, 1, 0.898

bench-memcpy-random: length, New Time / Old Time
32768, 0.866 65536, 0.891 131072, 0.896 262144, 0.901 524288, 0.904 1048576, 0.913
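In each of these tables the last column is New Time / Old Time, so a ratio below 1.0 means the replacement implementation is faster on that input. As a reading aid only (a minimal sketch, not part of this patch or of glibc's benchtests), one way to condense such rows into a single figure is the geometric mean of the ratios:

/* Geometric mean of "New Time / Old Time" benchmark ratios.
   Illustrative only; the sample values are the six
   bench-memcpy-random rows above.  Build with: gcc geomean.c -lm  */
#include <math.h>
#include <stddef.h>
#include <stdio.h>

int
main (void)
{
  /* Ratios copied from the bench-memcpy-random table above.  */
  static const double ratios[] = { 0.866, 0.891, 0.896,
				   0.901, 0.904, 0.913 };
  size_t n = sizeof ratios / sizeof ratios[0];
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  /* Geometric mean == exp of the mean of the logs.  */
  printf ("geomean ratio: %.3f\n", exp (log_sum / n));
  return 0;
}

For the six bench-memcpy-random rows this prints about 0.895, i.e. roughly a 10% average improvement.
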
bench-memcpy-large: length, align0, align1, dst>src, New Time/Old Time
65543, 0, 0, 0, 0.981 65543, 0, 0, 1, 0.981 65551, 0, 3, 0, 1.012 65551, 0, 3, 1, 1.013 65567, 3, 0, 0, 1.019 65567, 3, 0, 1, 1.02 65599, 3, 5, 0, 1.058 65599, 3, 5, 1, 1.061 65536, 0, 127, 0, 1.046 65536, 0, 127, 1, 1.046 65536, 0, 255, 0, 1.071 65536, 0, 255, 1, 1.071 65536, 0, 256, 0, 0.983 65536, 0, 256, 1, 0.984 65536, 0, 4064, 0, 1.017 65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.981 131079, 0, 0, 1, 0.981 131087, 0, 3, 0, 1.017 131087, 0, 3, 1, 1.017 131103, 3, 0, 0, 1.022 131103, 3, 0, 1, 1.022 131135, 3, 5, 0, 1.064 131135, 3, 5, 1, 1.065 131072, 0, 127, 0, 1.05 131072, 0, 127, 1, 1.05 131072, 0, 255, 0, 1.074 131072, 0, 255, 1, 1.074 131072, 0, 256, 0, 0.984 131072, 0, 256, 1, 0.984 131072, 0, 4064, 0, 1.018 131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.985 262151, 0, 0, 1, 0.985 262159, 0, 3, 0, 1.026 262159, 0, 3, 1, 1.026 262175, 3, 0, 0, 1.03 262175, 3, 0, 1, 1.03 262207, 3, 5, 0, 1.07 262207, 3, 5, 1, 1.07 262144, 0, 127, 0, 1.057 262144, 0, 127, 1, 1.057 262144, 0, 255, 0, 1.079 262144, 0, 255, 1, 1.078 262144, 0, 256, 0, 0.988 262144, 0, 256, 1, 0.988 262144, 0, 4064, 0, 1.02 262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692 524295, 0, 0, 1, 0.692 524303, 0, 3, 0, 0.736 524303, 0, 3, 1, 0.737 524319, 3, 0, 0, 0.758 524319, 3, 0, 1, 0.759 524351, 3, 5, 0, 0.759 524351, 3, 5, 1, 0.759 524288, 0, 127, 0, 1.057 524288, 0, 127, 1, 1.058 524288, 0, 255, 0, 1.079 524288, 0, 255, 1, 1.079 524288, 0, 256, 0, 0.988 524288, 0, 256, 1, 0.988 524288, 0, 4064, 0, 1.02 524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948 1048583, 0, 0, 1, 0.948 1048591, 0, 3, 0, 0.735 1048591, 0, 3, 1, 0.735 1048607, 3, 0, 0, 0.757 1048607, 3, 0, 1, 0.758 1048639, 3, 5, 0, 0.758 1048639, 3, 5, 1, 0.758 1048576, 0, 127, 0, 0.761 1048576, 0, 127, 1, 0.762 1048576, 0, 255, 0, 0.751 1048576, 0, 255, 1, 0.751 1048576, 0, 256, 0, 0.93 1048576, 0, 256, 1, 0.93 1048576, 0, 4064, 0, 0.93 1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928 2097159, 0, 0, 1, 0.931 2097167, 0, 3, 0, 0.735 2097167, 0, 3, 1, 0.734 2097183, 3, 0, 0, 0.759 2097183, 3, 0, 1, 0.759 2097215, 3, 5, 0, 0.758 2097215, 3, 5, 1, 0.757 2097152, 0, 127, 0, 0.77 2097152, 0, 127, 1, 0.77 2097152, 0, 255, 0, 0.745 2097152, 0, 255, 1, 0.745 2097152, 0, 256, 0, 0.924 2097152, 0, 256, 1, 0.925 2097152, 0, 4064, 0, 0.926 2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.894 4194311, 0, 0, 1, 0.896 4194319, 0, 3, 0, 0.752 4194319, 0, 3, 1, 0.751 4194335, 3, 0, 0, 0.82 4194335, 3, 0, 1, 0.821 4194367, 3, 5, 0, 0.788 4194367, 3, 5, 1, 0.789 4194304, 0, 127, 0, 0.801 4194304, 0, 127, 1, 0.801 4194304, 0, 255, 0, 0.802 4194304, 0, 255, 1, 0.804 4194304, 0, 256, 0, 0.873 4194304, 0, 256, 1, 0.868 4194304, 0, 4064, 0, 0.955 4194304, 0, 4064, 1, 0.954
8388615, 0, 0, 0, 0.885 8388615, 0, 0, 1, 0.886 8388623, 0, 3, 0, 0.769 8388623, 0, 3, 1, 0.769 8388639, 3, 0, 0, 0.87 8388639, 3, 0, 1, 0.87 8388671, 3, 5, 0, 0.811 8388671, 3, 5, 1, 0.814 8388608, 0, 127, 0, 0.83 8388608, 0, 127, 1, 0.83 8388608, 0, 255, 0, 0.857 8388608, 0, 255, 1, 0.857 8388608, 0, 256, 0, 0.851 8388608, 0, 256, 1, 0.848 8388608, 0, 4064, 0, 0.981 8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.885 16777223, 0, 0, 1, 0.886 16777231, 0, 3, 0, 0.769 16777231, 0, 3, 1, 0.768 16777247, 3, 0, 0, 0.87 16777247, 3, 0, 1, 0.87 16777279, 3, 5, 0, 0.811 16777279, 3, 5, 1, 0.814 16777216, 0, 127, 0, 0.831 16777216, 0, 127, 1, 0.83 16777216, 0, 255, 0, 0.857 16777216, 0, 255, 1, 0.857 16777216, 0, 256, 0, 0.852 16777216, 0, 256, 1, 0.848 16777216, 0, 4064, 0, 0.98 16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.885 33554439, 0, 0, 1, 0.886 33554447, 0, 3, 0, 0.768 33554447, 0, 3, 1, 0.768 33554463, 3, 0, 0, 0.871 33554463, 3, 0, 1, 0.87 33554495, 3, 5, 0, 0.811 33554495, 3, 5, 1, 0.814 33554432, 0, 127, 0, 0.831 33554432, 0, 127, 1, 0.831 33554432, 0, 255, 0, 0.858 33554432, 0, 255, 1, 0.857 33554432, 0, 256, 0, 0.852 33554432, 0, 256, 1, 0.848 33554432, 0, 4064, 0, 0.98 33554432, 0, 4064, 1, 0.981

 sysdeps/x86_64/multiarch/Makefile          |    4 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    8 -
 sysdeps/x86_64/multiarch/stpcpy-ssse3.S    |    3 -
 sysdeps/x86_64/multiarch/stpncpy-ssse3.S   |    4 -
 sysdeps/x86_64/multiarch/strcpy-ssse3.S    | 3550 --------------------
 sysdeps/x86_64/multiarch/strncpy-ssse3.S   |    3 -
 6 files changed, 3572 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2b3c625ea2..5b02ec8de5 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -46,13 +46,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -83,7 +81,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \
strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 41a04621ad..49ce6860d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. */ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. 
- - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb (%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - 
jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 
-# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb 
%xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, 
%xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - 
jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, 
%xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave7) -# endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 
16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), 
%rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, 
%xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 -L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), 
%rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - 
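/* Editorial aside, not part of the original source: in the strncpy
   build, %r8 carries the caller's remaining length budget.  It is
   decremented ahead of every 16-byte store; when it drops to zero or
   below, control leaves the NUL scan for the
   L(StrncpyExitNCase2OrCase3) handlers, which truncate the copy and,
   when the string ends early, zero-fill the rest of the buffer.  */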
sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz 
L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp $15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 - jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz 
L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg 
L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - 
pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 
16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - 
lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov 
-8(%rcx), %rax
-	xor %rsi, %rsi
-	mov %rax, -8(%rdx)
-	jmp L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave12):
-	movaps %xmm2, %xmm3
-	add $48, %r8
-	jle L(StrncpyExit12)
-	palignr $12, %xmm1, %xmm2
-	movaps %xmm2, (%rdx)
-	movaps 20(%rcx), %xmm2
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit12)
-	palignr $12, %xmm3, %xmm2
-	movaps %xmm2, 16(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit12)
-	movaps %xmm4, 32(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit12)
-	movaps %xmm5, 48(%rdx)
-	lea 16(%rsi), %rsi
-	lea -16(%r8), %r8
-
-L(StrncpyExit12):
-	lea 4(%rdx, %rsi), %rdx
-	lea 4(%rcx, %rsi), %rcx
-	mov -4(%rcx), %eax
-	xor %rsi, %rsi
-	mov %eax, -4(%rdx)
-	jmp L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave13):
-	movaps %xmm2, %xmm3
-	add $48, %r8
-	jle L(StrncpyExit13)
-	palignr $13, %xmm1, %xmm2
-	movaps %xmm2, (%rdx)
-	movaps 19(%rcx), %xmm2
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit13)
-	palignr $13, %xmm3, %xmm2
-	movaps %xmm2, 16(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit13)
-	movaps %xmm4, 32(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit13)
-	movaps %xmm5, 48(%rdx)
-	lea 16(%rsi), %rsi
-	lea -16(%r8), %r8
-
-L(StrncpyExit13):
-	lea 3(%rdx, %rsi), %rdx
-	lea 3(%rcx, %rsi), %rcx
-	mov -4(%rcx), %eax
-	xor %rsi, %rsi
-	mov %eax, -4(%rdx)
-	jmp L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave14):
-	movaps %xmm2, %xmm3
-	add $48, %r8
-	jle L(StrncpyExit14)
-	palignr $14, %xmm1, %xmm2
-	movaps %xmm2, (%rdx)
-	movaps 18(%rcx), %xmm2
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit14)
-	palignr $14, %xmm3, %xmm2
-	movaps %xmm2, 16(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit14)
-	movaps %xmm4, 32(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit14)
-	movaps %xmm5, 48(%rdx)
-	lea 16(%rsi), %rsi
-	lea -16(%r8), %r8
-
-L(StrncpyExit14):
-	lea 2(%rdx, %rsi), %rdx
-	lea 2(%rcx, %rsi), %rcx
-	movw -2(%rcx), %ax
-	xor %rsi, %rsi
-	movw %ax, -2(%rdx)
-	jmp L(CopyFrom1To16BytesCase3)
-
-	.p2align 4
-L(StrncpyLeave15):
-	movaps %xmm2, %xmm3
-	add $48, %r8
-	jle L(StrncpyExit15)
-	palignr $15, %xmm1, %xmm2
-	movaps %xmm2, (%rdx)
-	movaps 17(%rcx), %xmm2
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit15)
-	palignr $15, %xmm3, %xmm2
-	movaps %xmm2, 16(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit15)
-	movaps %xmm4, 32(%rdx)
-	lea 16(%rsi), %rsi
-	sub $16, %r8
-	jbe L(StrncpyExit15)
-	movaps %xmm5, 48(%rdx)
-	lea 16(%rsi), %rsi
-	lea -16(%r8), %r8
-
-L(StrncpyExit15):
-	lea 1(%rdx, %rsi), %rdx
-	lea 1(%rcx, %rsi), %rcx
-	movb -1(%rcx), %ah
-	xor %rsi, %rsi
-	movb %ah, -1(%rdx)
-	jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1

^ permalink raw reply	[flat|nested] 49+ messages in thread
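For context on what the patch removes: the SSSE3 strcpy family is built
around two tricks.  Because palignr only accepts an immediate shift
count, the file carries one unrolled L(ShlN) block per relative
misalignment 1..15; each 64-byte loop iteration then folds its four
aligned loads with pminub so that a single pcmpeqb/pmovmskb tests all
64 bytes for the terminating NUL before anything is stored.  A minimal
C intrinsics sketch of the shift-3 case follows; the function names,
pointer conventions, and bounds handling are illustrative assumptions,
not glibc code.

#include <emmintrin.h>   /* SSE2 */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (palignr) */

/* pminub trick: the unsigned byte-wise minimum of four vectors has a
   zero lane iff any of the 64 loaded bytes is zero, so one compare
   plus one movemask covers four 16-byte loads.  */
static inline int
nul_in_64 (__m128i a, __m128i b, __m128i c, __m128i d)
{
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
}

/* One iteration of the shift-3 copy loop (mirrors L(Shl3LoopStart)):
   src_al and dst are 16-byte aligned, *prev holds the 16 bytes at
   src_al - 16, and the 64 string bytes being copied start at
   src_al - 13 (source alignment offset 3).  Returns nonzero, without
   storing, when the terminator lies in these 64 bytes, so the caller
   can fall back to a byte-exact tail copy.  */
static inline int
copy64_shl3 (char *dst, const char *src_al, __m128i *prev)
{
  __m128i a = _mm_load_si128 ((const __m128i *) src_al);
  __m128i b = _mm_load_si128 ((const __m128i *) (src_al + 16));
  __m128i c = _mm_load_si128 ((const __m128i *) (src_al + 32));
  __m128i d = _mm_load_si128 ((const __m128i *) (src_al + 48));

  if (nul_in_64 (a, b, c, d))
    return 1;

  /* palignr: concatenate two aligned chunks and shift right by the
     misalignment, recreating the unaligned string bytes.  */
  _mm_store_si128 ((__m128i *) dst,        _mm_alignr_epi8 (a, *prev, 3));
  _mm_store_si128 ((__m128i *) (dst + 16), _mm_alignr_epi8 (b, a, 3));
  _mm_store_si128 ((__m128i *) (dst + 32), _mm_alignr_epi8 (c, b, 3));
  _mm_store_si128 ((__m128i *) (dst + 48), _mm_alignr_epi8 (d, c, 3));
  *prev = d;   /* Becomes the carry-in for the next 64 bytes.  */
  return 0;
}

A C version would need the same 15-way specialization the assembly gets
from its L(Shl1)..L(Shl15) blocks, since the count operand of
_mm_alignr_epi8, like palignr's, must be a compile-time constant; the
dispatcher near the top of the function picks the block from the
relative misalignment of source and destination.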
* Re: [PATCH v3 1/6] x86: Remove str{p}{n}cpy-ssse3
  2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
  0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library

Disregard this patch. It's from the wrong patchset.

On Sat, Apr 9, 2022 at 7:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> Full memcpy ssse3 results. Numbers are the comparison of
> geometric means of N=50 runs on Zhaoxin KX-6840@2000MHz
>
> bench-memcpy:
>
> length, align1, align2, dst > src, New Time / Old Time
> 1, 0, 0, 0, 2.099
> 1, 0, 0, 1, 2.099
> 1, 32, 0, 0, 2.103
> 1, 32, 0, 1, 2.103
> 1, 0, 32, 0, 2.099
> 1, 0, 32, 1, 2.098
> 1, 32, 32, 0, 2.098
> 1, 32, 32, 1, 2.098
> 1, 2048, 0, 0, 2.098
> 1, 2048, 0, 1, 2.098
> 2, 0, 0, 0, 1.135
> 2, 0, 0, 1, 1.136
> 2, 1, 0, 0, 1.139
> 2, 1, 0, 1, 1.139
> 2, 33, 0, 0, 1.165
> 2, 33, 0, 1, 1.139
> 2, 0, 1, 0, 1.136
> 2, 0, 1, 1, 1.136
> 2, 0, 33, 0, 1.136
> 2, 0, 33, 1, 1.136
> 2, 1, 1, 0, 1.136
> 2, 1, 1, 1, 1.136
> 2, 33, 33, 0, 1.136
> 2, 33, 33, 1, 1.136
> 2, 2048, 0, 0, 1.136
> 2, 2048, 0, 1, 1.136
> 2, 2049, 0, 0, 1.191
> 2, 2049, 0, 1, 1.139
> 2, 2048, 1, 0, 1.136
> 2, 2048, 1, 1, 1.136
> 2, 2049, 1, 0, 1.136
> 2, 2049, 1, 1, 1.136
> 4, 0, 0, 0, 1.074
> 4, 0, 0, 1, 0.962
> 4, 2, 0, 0, 0.973
> 4, 2, 0, 1, 0.989
> 4, 34, 0, 0, 0.991
> 4, 34, 0, 1, 0.991
> 4, 0, 2, 0, 0.962
> 4, 0, 2, 1, 0.962
> 4, 0, 34, 0, 0.962
> 4, 0, 34, 1, 0.962
> 4, 2, 2, 0, 0.962
> 4, 2, 2, 1, 0.962
> 4, 34, 34, 0, 0.962
> 4, 34, 34, 1, 0.962
> 4, 2048, 0, 0, 0.962
> 4, 2048, 0, 1, 0.962
> 4, 2050, 0, 0, 0.977
> 4, 2050, 0, 1, 0.979
> 4, 2048, 2, 0, 0.962
> 4, 2048, 2, 1, 0.962
> 4, 2050, 2, 0, 0.962
> 4, 2050, 2, 1, 0.962
> 8, 0, 0, 0, 0.961
> 8, 0, 0, 1, 0.962
> 8, 3, 0, 0, 1.0
> 8, 3, 0, 1, 1.0
> 8, 35, 0, 0, 1.0
> 8, 35, 0, 1, 1.0
> 8, 0, 3, 0, 0.962
> 8, 0, 3, 1, 0.962
> 8, 0, 35, 0, 0.962
> 8, 0, 35, 1, 0.962
> 8, 3, 3, 0, 0.962
> 8, 3, 3, 1, 0.962
> 8, 35, 35, 0, 0.962
> 8, 35, 35, 1, 0.962
> 8, 2048, 0, 0, 0.962
> 8, 2048, 0, 1, 0.962
> 8, 2051, 0, 0, 1.0
> 8, 2051, 0, 1, 1.0
> 8, 2048, 3, 0, 0.962
> 8, 2048, 3, 1, 0.962
> 8, 2051, 3, 0, 0.962
> 8, 2051, 3, 1, 0.962
> 16, 0, 0, 0, 0.798
> 16, 0, 0, 1, 0.799
> 16, 4, 0, 0, 0.8
> 16, 4, 0, 1, 0.801
> 16, 36, 0, 0, 0.801
> 16, 36, 0, 1, 0.8
> 16, 0, 4, 0, 0.798
> 16, 0, 4, 1, 0.798
> 16, 0, 36, 0, 0.798
> 16, 0, 36, 1, 0.798
> 16, 4, 4, 0, 0.798
> 16, 4, 4, 1, 0.798
> 16, 36, 36, 0, 0.798
> 16, 36, 36, 1, 0.798
> 16, 2048, 0, 0, 0.798
> 16, 2048, 0, 1, 0.799
> 16, 2052, 0, 0, 0.8
> 16, 2052, 0, 1, 0.8
> 16, 2048, 4, 0, 0.798
> 16, 2048, 4, 1, 0.798
> 16, 2052, 4, 0, 0.798
> 16, 2052, 4, 1, 0.798
> 32, 0, 0, 0, 0.471
> 32, 0, 0, 1, 0.471
> 32, 5, 0, 0, 0.471
> 32, 5, 0, 1, 0.471
> 32, 37, 0, 0, 0.961
> 32, 37, 0, 1, 0.961
> 32, 0, 5, 0, 0.471
> 32, 0, 5, 1, 0.471
> 32, 0, 37, 0, 1.021
> 32, 0, 37, 1, 1.021
> 32, 5, 5, 0, 0.471
> 32, 5, 5, 1, 0.471
> 32, 37, 37, 0, 1.011
> 32, 37, 37, 1, 1.011
> 32, 2048, 0, 0, 0.471
> 32, 2048, 0, 1, 0.471
> 32, 2053, 0, 0, 0.471
> 32, 2053, 0, 1, 0.471
> 32, 2048, 5, 0, 0.471
> 32, 2048, 5, 1, 0.471
> 32, 2053, 5, 0, 0.471
> 32, 2053, 5, 1, 0.471
> 64, 0, 0, 0, 1.0
> 64, 0, 0, 1, 1.0
> 64, 6, 0, 0, 0.862
> 64, 6, 0, 1, 0.862
> 64, 38, 0, 0, 0.912
> 64, 38, 0, 1, 0.912
> 64, 0, 6, 0, 0.896
> 64, 0, 6, 1, 0.896
> 64, 0, 38, 0, 0.906
> 64, 0,
38, 1, 0.906 > 64, 6, 6, 0, 0.91 > 64, 6, 6, 1, 0.91 > 64, 38, 38, 0, 0.883 > 64, 38, 38, 1, 0.883 > 64, 2048, 0, 0, 1.0 > 64, 2048, 0, 1, 1.0 > 64, 2054, 0, 0, 0.862 > 64, 2054, 0, 1, 0.862 > 64, 2048, 6, 0, 0.887 > 64, 2048, 6, 1, 0.887 > 64, 2054, 6, 0, 0.887 > 64, 2054, 6, 1, 0.887 > 128, 0, 0, 0, 0.857 > 128, 0, 0, 1, 0.857 > 128, 7, 0, 0, 0.875 > 128, 7, 0, 1, 0.875 > 128, 39, 0, 0, 0.892 > 128, 39, 0, 1, 0.892 > 128, 0, 7, 0, 1.183 > 128, 0, 7, 1, 1.183 > 128, 0, 39, 0, 1.113 > 128, 0, 39, 1, 1.113 > 128, 7, 7, 0, 0.692 > 128, 7, 7, 1, 0.692 > 128, 39, 39, 0, 1.104 > 128, 39, 39, 1, 1.104 > 128, 2048, 0, 0, 0.857 > 128, 2048, 0, 1, 0.857 > 128, 2055, 0, 0, 0.875 > 128, 2055, 0, 1, 0.875 > 128, 2048, 7, 0, 0.959 > 128, 2048, 7, 1, 0.959 > 128, 2055, 7, 0, 1.036 > 128, 2055, 7, 1, 1.036 > 256, 0, 0, 0, 0.889 > 256, 0, 0, 1, 0.889 > 256, 8, 0, 0, 0.966 > 256, 8, 0, 1, 0.966 > 256, 40, 0, 0, 0.983 > 256, 40, 0, 1, 0.983 > 256, 0, 8, 0, 1.29 > 256, 0, 8, 1, 1.29 > 256, 0, 40, 0, 1.274 > 256, 0, 40, 1, 1.274 > 256, 8, 8, 0, 0.865 > 256, 8, 8, 1, 0.865 > 256, 40, 40, 0, 1.477 > 256, 40, 40, 1, 1.477 > 256, 2048, 0, 0, 0.889 > 256, 2048, 0, 1, 0.889 > 256, 2056, 0, 0, 0.966 > 256, 2056, 0, 1, 0.966 > 256, 2048, 8, 0, 0.952 > 256, 2048, 8, 1, 0.952 > 256, 2056, 8, 0, 0.878 > 256, 2056, 8, 1, 0.878 > 512, 0, 0, 0, 1.077 > 512, 0, 0, 1, 1.077 > 512, 9, 0, 0, 1.001 > 512, 9, 0, 1, 1.0 > 512, 41, 0, 0, 0.954 > 512, 41, 0, 1, 0.954 > 512, 0, 9, 0, 1.191 > 512, 0, 9, 1, 1.191 > 512, 0, 41, 0, 1.181 > 512, 0, 41, 1, 1.181 > 512, 9, 9, 0, 0.765 > 512, 9, 9, 1, 0.765 > 512, 41, 41, 0, 0.905 > 512, 41, 41, 1, 0.905 > 512, 2048, 0, 0, 1.077 > 512, 2048, 0, 1, 1.077 > 512, 2057, 0, 0, 1.0 > 512, 2057, 0, 1, 1.0 > 512, 2048, 9, 0, 1.0 > 512, 2048, 9, 1, 1.0 > 512, 2057, 9, 0, 0.733 > 512, 2057, 9, 1, 0.733 > 1024, 0, 0, 0, 1.143 > 1024, 0, 0, 1, 1.143 > 1024, 10, 0, 0, 1.015 > 1024, 10, 0, 1, 1.015 > 1024, 42, 0, 0, 1.045 > 1024, 42, 0, 1, 1.045 > 1024, 0, 10, 0, 1.126 > 1024, 0, 10, 1, 1.126 > 1024, 0, 42, 0, 1.114 > 1024, 0, 42, 1, 1.114 > 1024, 10, 10, 0, 0.89 > 1024, 10, 10, 1, 0.89 > 1024, 42, 42, 0, 0.986 > 1024, 42, 42, 1, 0.986 > 1024, 2048, 0, 0, 1.143 > 1024, 2048, 0, 1, 1.143 > 1024, 2058, 0, 0, 1.015 > 1024, 2058, 0, 1, 1.015 > 1024, 2048, 10, 0, 1.03 > 1024, 2048, 10, 1, 1.03 > 1024, 2058, 10, 0, 0.854 > 1024, 2058, 10, 1, 0.854 > 2048, 0, 0, 0, 1.005 > 2048, 0, 0, 1, 1.005 > 2048, 11, 0, 0, 1.013 > 2048, 11, 0, 1, 1.014 > 2048, 43, 0, 0, 1.044 > 2048, 43, 0, 1, 1.044 > 2048, 0, 11, 0, 1.003 > 2048, 0, 11, 1, 1.003 > 2048, 0, 43, 0, 1.003 > 2048, 0, 43, 1, 1.003 > 2048, 11, 11, 0, 0.92 > 2048, 11, 11, 1, 0.92 > 2048, 43, 43, 0, 1.0 > 2048, 43, 43, 1, 1.0 > 2048, 2048, 0, 0, 1.005 > 2048, 2048, 0, 1, 1.005 > 2048, 2059, 0, 0, 0.904 > 2048, 2059, 0, 1, 0.904 > 2048, 2048, 11, 0, 1.0 > 2048, 2048, 11, 1, 1.0 > 2048, 2059, 11, 0, 0.979 > 2048, 2059, 11, 1, 0.979 > 4096, 0, 0, 0, 1.014 > 4096, 0, 0, 1, 1.014 > 4096, 12, 0, 0, 0.855 > 4096, 12, 0, 1, 0.855 > 4096, 44, 0, 0, 0.857 > 4096, 44, 0, 1, 0.857 > 4096, 0, 12, 0, 0.932 > 4096, 0, 12, 1, 0.932 > 4096, 0, 44, 0, 0.932 > 4096, 0, 44, 1, 0.932 > 4096, 12, 12, 0, 0.999 > 4096, 12, 12, 1, 0.999 > 4096, 44, 44, 0, 1.051 > 4096, 44, 44, 1, 1.051 > 4096, 2048, 0, 0, 1.014 > 4096, 2048, 0, 1, 1.014 > 4096, 2060, 0, 0, 0.98 > 4096, 2060, 0, 1, 0.98 > 4096, 2048, 12, 0, 0.77 > 4096, 2048, 12, 1, 0.77 > 4096, 2060, 12, 0, 0.943 > 4096, 2060, 12, 1, 0.943 > 8192, 0, 0, 0, 1.046 > 8192, 0, 0, 1, 1.046 > 8192, 13, 0, 0, 0.885 > 8192, 13, 0, 1, 0.885 > 
> [Remainder of the quoted benchmark table, several thousand rows flattened in transit, all in the same format: length, align1, align2, 0/1 flag, New Time / Old Time. Lengths span 0-65536 at plain, 2048-offset, and page-crossing (4095) alignments; the New/Old ratios cluster around 1.0, ranging from roughly 0.23 (short page-crossing cases, lengths 3-31) up to about 2.1 (lengths 0-1).]
1, 0.902 > 4800, 2091, 43, 0, 1.004 > 4800, 2091, 43, 1, 1.004 > 4800, 43, 1, 0, 1.026 > 4800, 43, 1, 1, 1.025 > 4800, 1, 43, 0, 0.91 > 4800, 1, 43, 1, 0.91 > 4800, 75, 1, 0, 0.992 > 4800, 75, 1, 1, 0.992 > 4800, 1, 75, 0, 0.921 > 4800, 1, 75, 1, 0.92 > 4800, 2091, 1, 0, 1.025 > 4800, 2091, 1, 1, 1.025 > 4800, 2049, 43, 0, 0.907 > 4800, 2049, 43, 1, 0.907 > 4864, 0, 0, 0, 0.998 > 4864, 0, 0, 1, 0.998 > 4864, 44, 0, 0, 1.003 > 4864, 44, 0, 1, 1.004 > 4864, 76, 0, 0, 0.987 > 4864, 76, 0, 1, 0.987 > 4864, 0, 44, 0, 0.92 > 4864, 0, 44, 1, 0.921 > 4864, 0, 76, 0, 0.933 > 4864, 0, 76, 1, 0.932 > 4864, 44, 44, 0, 1.006 > 4864, 44, 44, 1, 1.004 > 4864, 76, 76, 0, 0.976 > 4864, 76, 76, 1, 0.975 > 4864, 2048, 0, 0, 0.999 > 4864, 2048, 0, 1, 0.999 > 4864, 2092, 0, 0, 1.004 > 4864, 2092, 0, 1, 1.005 > 4864, 2048, 44, 0, 0.907 > 4864, 2048, 44, 1, 0.907 > 4864, 2092, 44, 0, 1.006 > 4864, 2092, 44, 1, 1.005 > 4864, 44, 1, 0, 1.034 > 4864, 44, 1, 1, 1.032 > 4864, 1, 44, 0, 0.908 > 4864, 1, 44, 1, 0.929 > 4864, 76, 1, 0, 1.006 > 4864, 76, 1, 1, 1.005 > 4864, 1, 76, 0, 0.798 > 4864, 1, 76, 1, 0.798 > 4864, 2092, 1, 0, 1.033 > 4864, 2092, 1, 1, 1.033 > 4864, 2049, 44, 0, 0.904 > 4864, 2049, 44, 1, 0.925 > 4928, 0, 0, 0, 1.005 > 4928, 0, 0, 1, 1.005 > 4928, 45, 0, 0, 0.993 > 4928, 45, 0, 1, 1.012 > 4928, 77, 0, 0, 0.956 > 4928, 77, 0, 1, 0.976 > 4928, 0, 45, 0, 0.933 > 4928, 0, 45, 1, 0.932 > 4928, 0, 77, 0, 0.771 > 4928, 0, 77, 1, 0.771 > 4928, 45, 45, 0, 1.015 > 4928, 45, 45, 1, 1.015 > 4928, 77, 77, 0, 0.972 > 4928, 77, 77, 1, 0.972 > 4928, 2048, 0, 0, 1.005 > 4928, 2048, 0, 1, 1.005 > 4928, 2093, 0, 0, 0.992 > 4928, 2093, 0, 1, 1.012 > 4928, 2048, 45, 0, 0.932 > 4928, 2048, 45, 1, 0.931 > 4928, 2093, 45, 0, 1.015 > 4928, 2093, 45, 1, 1.015 > 4928, 45, 1, 0, 1.009 > 4928, 45, 1, 1, 1.032 > 4928, 1, 45, 0, 0.806 > 4928, 1, 45, 1, 0.805 > 4928, 77, 1, 0, 0.981 > 4928, 77, 1, 1, 1.005 > 4928, 1, 77, 0, 0.917 > 4928, 1, 77, 1, 0.917 > 4928, 2093, 1, 0, 1.008 > 4928, 2093, 1, 1, 1.032 > 4928, 2049, 45, 0, 0.794 > 4928, 2049, 45, 1, 0.794 > 4992, 0, 0, 0, 0.999 > 4992, 0, 0, 1, 0.999 > 4992, 46, 0, 0, 0.985 > 4992, 46, 0, 1, 1.008 > 4992, 78, 0, 0, 0.963 > 4992, 78, 0, 1, 0.984 > 4992, 0, 46, 0, 0.908 > 4992, 0, 46, 1, 0.908 > 4992, 0, 78, 0, 0.752 > 4992, 0, 78, 1, 0.751 > 4992, 46, 46, 0, 0.997 > 4992, 46, 46, 1, 0.997 > 4992, 78, 78, 0, 0.969 > 4992, 78, 78, 1, 0.968 > 4992, 2048, 0, 0, 1.0 > 4992, 2048, 0, 1, 1.0 > 4992, 2094, 0, 0, 0.987 > 4992, 2094, 0, 1, 1.008 > 4992, 2048, 46, 0, 0.883 > 4992, 2048, 46, 1, 0.883 > 4992, 2094, 46, 0, 0.997 > 4992, 2094, 46, 1, 0.997 > 4992, 46, 1, 0, 0.998 > 4992, 46, 1, 1, 1.02 > 4992, 1, 46, 0, 0.917 > 4992, 1, 46, 1, 0.917 > 4992, 78, 1, 0, 0.972 > 4992, 78, 1, 1, 0.993 > 4992, 1, 78, 0, 0.919 > 4992, 1, 78, 1, 0.92 > 4992, 2094, 1, 0, 0.997 > 4992, 2094, 1, 1, 1.019 > 4992, 2049, 46, 0, 0.914 > 4992, 2049, 46, 1, 0.914 > 5056, 0, 0, 0, 1.002 > 5056, 0, 0, 1, 1.0 > 5056, 47, 0, 0, 1.005 > 5056, 47, 0, 1, 1.005 > 5056, 79, 0, 0, 0.989 > 5056, 79, 0, 1, 0.989 > 5056, 0, 47, 0, 0.918 > 5056, 0, 47, 1, 0.919 > 5056, 0, 79, 0, 0.772 > 5056, 0, 79, 1, 0.771 > 5056, 47, 47, 0, 1.006 > 5056, 47, 47, 1, 1.006 > 5056, 79, 79, 0, 0.972 > 5056, 79, 79, 1, 0.972 > 5056, 2048, 0, 0, 1.001 > 5056, 2048, 0, 1, 1.0 > 5056, 2095, 0, 0, 1.004 > 5056, 2095, 0, 1, 1.004 > 5056, 2048, 47, 0, 0.908 > 5056, 2048, 47, 1, 0.909 > 5056, 2095, 47, 0, 1.006 > 5056, 2095, 47, 1, 1.006 > 5056, 47, 1, 0, 1.033 > 5056, 47, 1, 1, 1.033 > 5056, 1, 47, 0, 0.919 > 5056, 1, 47, 1, 0.919 > 5056, 79, 1, 0, 
1.003
> 5056, 79, 1, 1, 1.005
> 5056, 1, 79, 0, 0.921
> 5056, 1, 79, 1, 0.921
> 5056, 2095, 1, 0, 1.032
> 5056, 2095, 1, 1, 1.034
> 5056, 2049, 47, 0, 0.918
> 5056, 2049, 47, 1, 0.917
> 5120, 0, 0, 0, 1.003
> 5120, 0, 0, 1, 1.003
> 5120, 48, 0, 0, 1.068
> 5120, 48, 0, 1, 1.068
> 5120, 80, 0, 0, 1.068
> 5120, 80, 0, 1, 1.068
> 5120, 0, 48, 0, 1.065
> 5120, 0, 48, 1, 1.065
> 5120, 0, 80, 0, 1.064
> 5120, 0, 80, 1, 1.065
> 5120, 48, 48, 0, 1.004
> 5120, 48, 48, 1, 1.004
> 5120, 80, 80, 0, 1.005
> 5120, 80, 80, 1, 1.005
> 5120, 2048, 0, 0, 1.005
> 5120, 2048, 0, 1, 1.005
> 5120, 2096, 0, 0, 1.068
> 5120, 2096, 0, 1, 1.068
> 5120, 2048, 48, 0, 1.065
> 5120, 2048, 48, 1, 1.065
> 5120, 2096, 48, 0, 1.005
> 5120, 2096, 48, 1, 1.005
> 5120, 48, 1, 0, 1.033
> 5120, 48, 1, 1, 1.031
> 5120, 1, 48, 0, 0.898
> 5120, 1, 48, 1, 0.898
> 5120, 80, 1, 0, 0.844
> 5120, 80, 1, 1, 0.844
> 5120, 1, 80, 0, 0.898
> 5120, 1, 80, 1, 0.898
> 5120, 2096, 1, 0, 0.856
> 5120, 2096, 1, 1, 0.855
> 5120, 2049, 48, 0, 0.898
> 5120, 2049, 48, 1, 0.898
>
> bench-memcpy-random:
>
> length, New Time / Old Time
> 32768, 0.866
> 65536, 0.891
> 131072, 0.896
> 262144, 0.901
> 524288, 0.904
> 1048576, 0.913
>
> bench-memcpy-large:
>
> length, align0, align1, dst>src, New Time/Old Time
> 65543, 0, 0, 0, 0.981
> 65543, 0, 0, 1, 0.981
> 65551, 0, 3, 0, 1.012
> 65551, 0, 3, 1, 1.013
> 65567, 3, 0, 0, 1.019
> 65567, 3, 0, 1, 1.02
> 65599, 3, 5, 0, 1.058
> 65599, 3, 5, 1, 1.061
> 65536, 0, 127, 0, 1.046
> 65536, 0, 127, 1, 1.046
> 65536, 0, 255, 0, 1.071
> 65536, 0, 255, 1, 1.071
> 65536, 0, 256, 0, 0.983
> 65536, 0, 256, 1, 0.984
> 65536, 0, 4064, 0, 1.017
> 65536, 0, 4064, 1, 1.018
> 131079, 0, 0, 0, 0.981
> 131079, 0, 0, 1, 0.981
> 131087, 0, 3, 0, 1.017
> 131087, 0, 3, 1, 1.017
> 131103, 3, 0, 0, 1.022
> 131103, 3, 0, 1, 1.022
> 131135, 3, 5, 0, 1.064
> 131135, 3, 5, 1, 1.065
> 131072, 0, 127, 0, 1.05
> 131072, 0, 127, 1, 1.05
> 131072, 0, 255, 0, 1.074
> 131072, 0, 255, 1, 1.074
> 131072, 0, 256, 0, 0.984
> 131072, 0, 256, 1, 0.984
> 131072, 0, 4064, 0, 1.018
> 131072, 0, 4064, 1, 1.019
> 262151, 0, 0, 0, 0.985
> 262151, 0, 0, 1, 0.985
> 262159, 0, 3, 0, 1.026
> 262159, 0, 3, 1, 1.026
> 262175, 3, 0, 0, 1.03
> 262175, 3, 0, 1, 1.03
> 262207, 3, 5, 0, 1.07
> 262207, 3, 5, 1, 1.07
> 262144, 0, 127, 0, 1.057
> 262144, 0, 127, 1, 1.057
> 262144, 0, 255, 0, 1.079
> 262144, 0, 255, 1, 1.078
> 262144, 0, 256, 0, 0.988
> 262144, 0, 256, 1, 0.988
> 262144, 0, 4064, 0, 1.02
> 262144, 0, 4064, 1, 1.02
> 524295, 0, 0, 0, 0.692
> 524295, 0, 0, 1, 0.692
> 524303, 0, 3, 0, 0.736
> 524303, 0, 3, 1, 0.737
> 524319, 3, 0, 0, 0.758
> 524319, 3, 0, 1, 0.759
> 524351, 3, 5, 0, 0.759
> 524351, 3, 5, 1, 0.759
> 524288, 0, 127, 0, 1.057
> 524288, 0, 127, 1, 1.058
> 524288, 0, 255, 0, 1.079
> 524288, 0, 255, 1, 1.079
> 524288, 0, 256, 0, 0.988
> 524288, 0, 256, 1, 0.988
> 524288, 0, 4064, 0, 1.02
> 524288, 0, 4064, 1, 1.02
> 1048583, 0, 0, 0, 0.948
> 1048583, 0, 0, 1, 0.948
> 1048591, 0, 3, 0, 0.735
> 1048591, 0, 3, 1, 0.735
> 1048607, 3, 0, 0, 0.757
> 1048607, 3, 0, 1, 0.758
> 1048639, 3, 5, 0, 0.758
> 1048639, 3, 5, 1, 0.758
> 1048576, 0, 127, 0, 0.761
> 1048576, 0, 127, 1, 0.762
> 1048576, 0, 255, 0, 0.751
> 1048576, 0, 255, 1, 0.751
> 1048576, 0, 256, 0, 0.93
> 1048576, 0, 256, 1, 0.93
> 1048576, 0, 4064, 0, 0.93
> 1048576, 0, 4064, 1, 0.93
> 2097159, 0, 0, 0, 0.928
> 2097159, 0, 0, 1, 0.931
> 2097167, 0, 3, 0, 0.735
> 2097167, 0, 3, 1, 0.734
> 2097183, 3, 0, 0, 0.759
> 2097183, 3, 0, 1, 0.759
> 2097215, 3, 5, 0, 0.758
> 2097215, 3, 5, 1, 0.757
> 2097152, 0, 127, 0, 0.77
> 2097152, 0, 127, 1, 0.77
> 2097152, 0, 255, 0, 0.745
> 2097152, 0, 255, 1, 0.745
> 2097152, 0, 256, 0, 0.924
> 2097152, 0, 256, 1, 0.925
> 2097152, 0, 4064, 0, 0.926
> 2097152, 0, 4064, 1, 0.927
> 4194311, 0, 0, 0, 0.894
> 4194311, 0, 0, 1, 0.896
> 4194319, 0, 3, 0, 0.752
> 4194319, 0, 3, 1, 0.751
> 4194335, 3, 0, 0, 0.82
> 4194335, 3, 0, 1, 0.821
> 4194367, 3, 5, 0, 0.788
> 4194367, 3, 5, 1, 0.789
> 4194304, 0, 127, 0, 0.801
> 4194304, 0, 127, 1, 0.801
> 4194304, 0, 255, 0, 0.802
> 4194304, 0, 255, 1, 0.804
> 4194304, 0, 256, 0, 0.873
> 4194304, 0, 256, 1, 0.868
> 4194304, 0, 4064, 0, 0.955
> 4194304, 0, 4064, 1, 0.954
> 8388615, 0, 0, 0, 0.885
> 8388615, 0, 0, 1, 0.886
> 8388623, 0, 3, 0, 0.769
> 8388623, 0, 3, 1, 0.769
> 8388639, 3, 0, 0, 0.87
> 8388639, 3, 0, 1, 0.87
> 8388671, 3, 5, 0, 0.811
> 8388671, 3, 5, 1, 0.814
> 8388608, 0, 127, 0, 0.83
> 8388608, 0, 127, 1, 0.83
> 8388608, 0, 255, 0, 0.857
> 8388608, 0, 255, 1, 0.857
> 8388608, 0, 256, 0, 0.851
> 8388608, 0, 256, 1, 0.848
> 8388608, 0, 4064, 0, 0.981
> 8388608, 0, 4064, 1, 0.981
> 16777223, 0, 0, 0, 0.885
> 16777223, 0, 0, 1, 0.886
> 16777231, 0, 3, 0, 0.769
> 16777231, 0, 3, 1, 0.768
> 16777247, 3, 0, 0, 0.87
> 16777247, 3, 0, 1, 0.87
> 16777279, 3, 5, 0, 0.811
> 16777279, 3, 5, 1, 0.814
> 16777216, 0, 127, 0, 0.831
> 16777216, 0, 127, 1, 0.83
> 16777216, 0, 255, 0, 0.857
> 16777216, 0, 255, 1, 0.857
> 16777216, 0, 256, 0, 0.852
> 16777216, 0, 256, 1, 0.848
> 16777216, 0, 4064, 0, 0.98
> 16777216, 0, 4064, 1, 0.981
> 33554439, 0, 0, 0, 0.885
> 33554439, 0, 0, 1, 0.886
> 33554447, 0, 3, 0, 0.768
> 33554447, 0, 3, 1, 0.768
> 33554463, 3, 0, 0, 0.871
> 33554463, 3, 0, 1, 0.87
> 33554495, 3, 5, 0, 0.811
> 33554495, 3, 5, 1, 0.814
> 33554432, 0, 127, 0, 0.831
> 33554432, 0, 127, 1, 0.831
> 33554432, 0, 255, 0, 0.858
> 33554432, 0, 255, 1, 0.857
> 33554432, 0, 256, 0, 0.852
> 33554432, 0, 256, 1, 0.848
> 33554432, 0, 4064, 0, 0.98
> 33554432, 0, 4064, 1, 0.981
>
>
>  sysdeps/x86_64/multiarch/Makefile          | 4 -
>  sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
>  sysdeps/x86_64/multiarch/stpcpy-ssse3.S    | 3 -
>  sysdeps/x86_64/multiarch/stpncpy-ssse3.S   | 4 -
>  sysdeps/x86_64/multiarch/strcpy-ssse3.S    | 3550 --------------------
>  sysdeps/x86_64/multiarch/strncpy-ssse3.S   | 3 -
>  6 files changed, 3572 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
>  delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 2b3c625ea2..5b02ec8de5 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -46,13 +46,11 @@ sysdep_routines += \
>    stpcpy-evex \
>    stpcpy-sse2 \
>    stpcpy-sse2-unaligned \
> -  stpcpy-ssse3 \
>    stpncpy-avx2 \
>    stpncpy-avx2-rtm \
>    stpncpy-c \
>    stpncpy-evex \
>    stpncpy-sse2-unaligned \
> -  stpncpy-ssse3 \
>    strcasecmp_l-avx2 \
>    strcasecmp_l-avx2-rtm \
>    strcasecmp_l-evex \
> @@ -83,7 +81,6 @@ sysdep_routines += \
>    strcpy-evex \
>    strcpy-sse2 \
>    strcpy-sse2-unaligned \
> -  strcpy-ssse3 \
>    strcspn-c \
>    strcspn-sse2 \
>    strlen-avx2 \
> @@ -110,7 +107,6 @@ sysdep_routines += \
>    strncpy-c \
>    strncpy-evex \
>    strncpy-sse2-unaligned \
> -  strncpy-ssse3 \
>    strnlen-avx2 \
>    strnlen-avx2-rtm \
>    strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 41a04621ad..49ce6860d0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
>   IFUNC_IMPL (i, name, stpncpy,
> -       IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> -           __stpncpy_ssse3)
>         IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
>             __stpncpy_avx2)
>         IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
>   /* Support sysdeps/x86_64/multiarch/stpcpy.c.  */
>   IFUNC_IMPL (i, name, stpcpy,
> -       IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> -           __stpcpy_ssse3)
>         IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
>             __stpcpy_avx2)
>         IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>             (CPU_FEATURE_USABLE (AVX512VL)
>              && CPU_FEATURE_USABLE (AVX512BW)),
>             __strcpy_evex)
> -       IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> -           __strcpy_ssse3)
>         IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
>         IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>             (CPU_FEATURE_USABLE (AVX512VL)
>              && CPU_FEATURE_USABLE (AVX512BW)),
>             __strncpy_evex)
> -       IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> -           __strncpy_ssse3)
>         IFUNC_IMPL_ADD (array, i, strncpy, 1,
>             __strncpy_sse2_unaligned)
>         IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> -   Copyright (C) 2011-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.
*/ > - > -#if IS_IN (libc) > - > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > - > -# ifndef STRCPY > -# define STRCPY __strcpy_ssse3 > -# endif > - > - .section .text.ssse3,"ax",@progbits > -ENTRY (STRCPY) > - > - mov %rsi, %rcx > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > -# endif > - mov %rdi, %rdx > -# ifdef USE_AS_STRNCPY > - test %R8_LP, %R8_LP > - jz L(Exit0) > - cmp $8, %R8_LP > - jbe L(StrncpyExit8Bytes) > -# endif > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmpb $0, 7(%rcx) > - jz L(Exit8) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - jb L(StrncpyExit15Bytes) > -# endif > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmpb $0, 14(%rcx) > - jz L(Exit15) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - je L(Exit16) > -# endif > - cmpb $0, 15(%rcx) > - jz L(Exit16) > -# endif > - > -# ifdef USE_AS_STRNCPY > - mov %rcx, %rsi > - sub $16, %r8 > - and $0xf, %rsi > - > -/* add 16 bytes rcx_offset to r8 */ > - > - add %rsi, %r8 > -# endif > - lea 16(%rcx), %rsi > - and $-16, %rsi > - pxor %xmm0, %xmm0 > - mov (%rcx), %r9 > - mov %r9, (%rdx) > - pcmpeqb (%rsi), %xmm0 > - mov 8(%rcx), %r9 > - mov %r9, 8(%rdx) > - > -/* convert byte mask in xmm0 to bit mask */ > - > - pmovmskb %xmm0, %rax > - sub %rcx, %rsi > - > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - mov %rdx, %rax > - lea 16(%rdx), %rdx > - and $-16, %rdx > - sub %rdx, %rax > - > -# ifdef USE_AS_STRNCPY > - add %rax, %rsi > - lea -1(%rsi), %rsi > - and $1<<31, %esi > - test %rsi, %rsi > - jnz L(ContinueCopy) > - lea 16(%r8), %r8 > - > -L(ContinueCopy): > -# endif > - sub %rax, %rcx > - mov %rcx, %rax > - and $0xf, %rax > - mov $0, %rsi > - > -/* case: rcx_offset == rdx_offset */ > - > - jz L(Align16Both) > - > - cmp $8, %rax > - jae L(ShlHigh8) > - cmp $1, %rax > - je L(Shl1) > - cmp $2, %rax > - je L(Shl2) > - cmp $3, %rax > - je L(Shl3) > - cmp $4, %rax > - je L(Shl4) > - cmp $5, %rax > - je L(Shl5) > - cmp $6, %rax > - je L(Shl6) > - jmp L(Shl7) > - > -L(ShlHigh8): > - je L(Shl8) > - cmp $9, %rax > - je L(Shl9) > - cmp $10, %rax > - je L(Shl10) > - cmp $11, %rax > - je L(Shl11) > - cmp $12, %rax > - je L(Shl12) > - cmp $13, %rax > - je L(Shl13) > - cmp $14, %rax > - je L(Shl14) > - jmp L(Shl15) > - > -L(Align16Both): > - movaps (%rcx), %xmm1 > - movaps 16(%rcx), %xmm2 > - movaps %xmm1, (%rdx) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm4 > - movaps %xmm3, (%rdx, %rsi) > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# 
endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm1 > - movaps %xmm4, (%rdx, %rsi) > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm2 > - movaps %xmm1, (%rdx, %rsi) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm3, (%rdx, %rsi) > - mov %rcx, %rax > - lea 16(%rcx, %rsi), %rcx > - and $-0x40, %rcx > - sub %rcx, %rax > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - lea 112(%r8, %rax), %r8 > -# endif > - mov $-0x40, %rsi > - > - .p2align 4 > -L(Aligned64Loop): > - movaps (%rcx), %xmm2 > - movaps %xmm2, %xmm4 > - movaps 16(%rcx), %xmm5 > - movaps 32(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 48(%rcx), %xmm7 > - pminub %xmm5, %xmm2 > - pminub %xmm7, %xmm3 > - pminub %xmm2, %xmm3 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %rax > - lea 64(%rdx), %rdx > - lea 64(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeaveCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Aligned64Leave) > - movaps %xmm4, -64(%rdx) > - movaps %xmm5, -48(%rdx) > - movaps %xmm6, -32(%rdx) > - movaps %xmm7, -16(%rdx) > - jmp L(Aligned64Loop) > - > -L(Aligned64Leave): > -# ifdef USE_AS_STRNCPY > - lea 48(%r8), %r8 > -# endif > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm6, -32(%rdx) > - pcmpeqb %xmm7, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl1): > - movaps -1(%rcx), %xmm1 > - movaps 15(%rcx), %xmm2 > -L(Shl1Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, 
%rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 31(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -15(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -1(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl1LoopStart): > - movaps 15(%rcx), %xmm2 > - movaps 31(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 47(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 63(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $1, %xmm4, %xmm5 > - test %rax, %rax > - palignr $1, %xmm3, %xmm4 > - jnz L(Shl1Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave1) > -# endif > - palignr $1, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $1, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl1LoopStart) > - > -L(Shl1LoopExit): > - movdqu -1(%rcx), %xmm1 > - mov $15, %rsi > - movdqu %xmm1, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl2): > - movaps -2(%rcx), %xmm1 > - movaps 14(%rcx), %xmm2 > -L(Shl2Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 30(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -14(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -2(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl2LoopStart): > - movaps 14(%rcx), %xmm2 > - movaps 30(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 46(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 62(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr 
$2, %xmm4, %xmm5 > - test %rax, %rax > - palignr $2, %xmm3, %xmm4 > - jnz L(Shl2Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave2) > -# endif > - palignr $2, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $2, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl2LoopStart) > - > -L(Shl2LoopExit): > - movdqu -2(%rcx), %xmm1 > - mov $14, %rsi > - movdqu %xmm1, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl3): > - movaps -3(%rcx), %xmm1 > - movaps 13(%rcx), %xmm2 > -L(Shl3Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 29(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -13(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -3(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl3LoopStart): > - movaps 13(%rcx), %xmm2 > - movaps 29(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 45(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 61(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $3, %xmm4, %xmm5 > - test %rax, %rax > - palignr $3, %xmm3, %xmm4 > - jnz L(Shl3Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave3) > -# endif > - palignr $3, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $3, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl3LoopStart) > - > -L(Shl3LoopExit): > - movdqu -3(%rcx), %xmm1 > - mov $13, %rsi > - movdqu %xmm1, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl4): > - movaps -4(%rcx), %xmm1 > - movaps 12(%rcx), %xmm2 > -L(Shl4Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, 
%rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 28(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -12(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -4(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl4LoopStart): > - movaps 12(%rcx), %xmm2 > - movaps 28(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 44(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 60(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $4, %xmm4, %xmm5 > - test %rax, %rax > - palignr $4, %xmm3, %xmm4 > - jnz L(Shl4Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave4) > -# endif > - palignr $4, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $4, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl4LoopStart) > - > -L(Shl4LoopExit): > - movdqu -4(%rcx), %xmm1 > - mov $12, %rsi > - movdqu %xmm1, -4(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl5): > - movaps -5(%rcx), %xmm1 > - movaps 11(%rcx), %xmm2 > -L(Shl5Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 27(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - 
lea -11(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -5(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl5LoopStart): > - movaps 11(%rcx), %xmm2 > - movaps 27(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 43(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 59(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $5, %xmm4, %xmm5 > - test %rax, %rax > - palignr $5, %xmm3, %xmm4 > - jnz L(Shl5Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave5) > -# endif > - palignr $5, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $5, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl5LoopStart) > - > -L(Shl5LoopExit): > - movdqu -5(%rcx), %xmm1 > - mov $11, %rsi > - movdqu %xmm1, -5(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl6): > - movaps -6(%rcx), %xmm1 > - movaps 10(%rcx), %xmm2 > -L(Shl6Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 26(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -10(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -6(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl6LoopStart): > - movaps 10(%rcx), %xmm2 > - movaps 26(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 42(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 58(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $6, %xmm4, %xmm5 > - test %rax, %rax > - palignr $6, %xmm3, %xmm4 > - jnz L(Shl6Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave6) > -# endif > - palignr $6, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $6, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl6LoopStart) > - > -L(Shl6LoopExit): > - mov (%rcx), %r9 > - mov 6(%rcx), %esi > - mov %r9, (%rdx) > - mov 
%esi, 6(%rdx) > - mov $10, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl7): > - movaps -7(%rcx), %xmm1 > - movaps 9(%rcx), %xmm2 > -L(Shl7Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 25(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -9(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -7(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl7LoopStart): > - movaps 9(%rcx), %xmm2 > - movaps 25(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 41(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 57(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $7, %xmm4, %xmm5 > - test %rax, %rax > - palignr $7, %xmm3, %xmm4 > - jnz L(Shl7Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave7) > -# endif > - palignr $7, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $7, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl7LoopStart) > - > -L(Shl7LoopExit): > - mov (%rcx), %r9 > - mov 5(%rcx), %esi > - mov %r9, (%rdx) > - mov %esi, 5(%rdx) > - mov $9, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl8): > - movaps -8(%rcx), %xmm1 > - movaps 8(%rcx), %xmm2 > -L(Shl8Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe 
L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 24(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -8(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -8(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl8LoopStart): > - movaps 8(%rcx), %xmm2 > - movaps 24(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 40(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 56(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $8, %xmm4, %xmm5 > - test %rax, %rax > - palignr $8, %xmm3, %xmm4 > - jnz L(Shl8Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave8) > -# endif > - palignr $8, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $8, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl8LoopStart) > - > -L(Shl8LoopExit): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl9): > - movaps -9(%rcx), %xmm1 > - movaps 7(%rcx), %xmm2 > -L(Shl9Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 23(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -7(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -9(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl9LoopStart): > - movaps 7(%rcx), %xmm2 > - movaps 23(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 39(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 55(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - 
movaps %xmm5, %xmm7 > - palignr $9, %xmm4, %xmm5 > - test %rax, %rax > - palignr $9, %xmm3, %xmm4 > - jnz L(Shl9Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave9) > -# endif > - palignr $9, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $9, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl9LoopStart) > - > -L(Shl9LoopExit): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl10): > - movaps -10(%rcx), %xmm1 > - movaps 6(%rcx), %xmm2 > -L(Shl10Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 22(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -6(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -10(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl10LoopStart): > - movaps 6(%rcx), %xmm2 > - movaps 22(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 38(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 54(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $10, %xmm4, %xmm5 > - test %rax, %rax > - palignr $10, %xmm3, %xmm4 > - jnz L(Shl10Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave10) > -# endif > - palignr $10, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $10, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl10LoopStart) > - > -L(Shl10LoopExit): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl11): > - movaps -11(%rcx), %xmm1 > - movaps 5(%rcx), %xmm2 > -L(Shl11Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 
16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 21(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -5(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -11(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl11LoopStart): > - movaps 5(%rcx), %xmm2 > - movaps 21(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 37(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 53(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $11, %xmm4, %xmm5 > - test %rax, %rax > - palignr $11, %xmm3, %xmm4 > - jnz L(Shl11Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave11) > -# endif > - palignr $11, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $11, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl11LoopStart) > - > -L(Shl11LoopExit): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl12): > - movaps -12(%rcx), %xmm1 > - movaps 4(%rcx), %xmm2 > -L(Shl12Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 20(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov 
%rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -4(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -12(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl12LoopStart): > - movaps 4(%rcx), %xmm2 > - movaps 20(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 36(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 52(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $12, %xmm4, %xmm5 > - test %rax, %rax > - palignr $12, %xmm3, %xmm4 > - jnz L(Shl12Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave12) > -# endif > - palignr $12, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $12, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl12LoopStart) > - > -L(Shl12LoopExit): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl13): > - movaps -13(%rcx), %xmm1 > - movaps 3(%rcx), %xmm2 > -L(Shl13Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 19(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -3(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -13(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl13LoopStart): > - movaps 3(%rcx), %xmm2 > - movaps 19(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 35(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 51(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $13, %xmm4, %xmm5 > - test %rax, %rax > - palignr $13, %xmm3, %xmm4 > - jnz L(Shl13Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave13) > -# endif > - palignr $13, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $13, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl13LoopStart) > - > -L(Shl13LoopExit): 
> - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl14): > - movaps -14(%rcx), %xmm1 > - movaps 2(%rcx), %xmm2 > -L(Shl14Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 18(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -2(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -14(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl14LoopStart): > - movaps 2(%rcx), %xmm2 > - movaps 18(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 34(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 50(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $14, %xmm4, %xmm5 > - test %rax, %rax > - palignr $14, %xmm3, %xmm4 > - jnz L(Shl14Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave14) > -# endif > - palignr $14, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $14, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl14LoopStart) > - > -L(Shl14LoopExit): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl15): > - movaps -15(%rcx), %xmm1 > - movaps 1(%rcx), %xmm2 > -L(Shl15Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 
> - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 17(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -1(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -15(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl15LoopStart): > - movaps 1(%rcx), %xmm2 > - movaps 17(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 33(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 49(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $15, %xmm4, %xmm5 > - test %rax, %rax > - palignr $15, %xmm3, %xmm4 > - jnz L(Shl15Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave15) > -# endif > - palignr $15, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $15, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl15LoopStart) > - > -L(Shl15LoopExit): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > -# ifdef USE_AS_STRCAT > - jmp L(CopyFrom1To16Bytes) > -# endif > - > -# ifndef USE_AS_STRCAT > - > - .p2align 4 > -L(CopyFrom1To16Bytes): > -# ifdef USE_AS_STRNCPY > - add $16, %r8 > -# endif > - add %rsi, %rdx > - add %rsi, %rcx > - > - test %al, %al > - jz L(ExitHigh) > - test $0x01, %al > - jnz L(Exit1) > - test $0x02, %al > - jnz L(Exit2) > - test $0x04, %al > - jnz L(Exit3) > - test $0x08, %al > - jnz L(Exit4) > - test $0x10, %al > - jnz L(Exit5) > - test $0x20, %al > - jnz L(Exit6) > - test $0x40, %al > - jnz L(Exit7) > - > - .p2align 4 > -L(Exit8): > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $8, %r8 > - lea 8(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(ExitHigh): > - test $0x01, %ah > - jnz L(Exit9) > - test $0x02, %ah > - jnz L(Exit10) > - test $0x04, %ah > - jnz L(Exit11) > - test $0x08, %ah > - jnz L(Exit12) > - test $0x10, %ah > - jnz L(Exit13) > - test $0x20, %ah > - jnz L(Exit14) > - test $0x40, %ah > - jnz L(Exit15) > - > - .p2align 4 > -L(Exit16): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %rax > - mov %rax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 15(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - lea 16(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - > - .p2align 4 > -L(CopyFrom1To16BytesCase2): > - add $16, %r8 > - add %rsi, %rcx > - lea (%rsi, %rdx), %rsi > - lea -9(%r8), %rdx > - and $1<<7, %dh > - or %al, %dh > - test %dh, %dh > - lea (%rsi), %rdx > - jz L(ExitHighCase2) > - > - cmp $1, %r8 > - je L(Exit1) > - test $0x01, %al > - jnz L(Exit1) > - cmp $2, %r8 > - je 
L(Exit2) > - test $0x02, %al > - jnz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - test $0x04, %al > - jnz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - test $0x08, %al > - jnz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - test $0x10, %al > - jnz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - test $0x20, %al > - jnz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - test $0x40, %al > - jnz L(Exit7) > - jmp L(Exit8) > - > - .p2align 4 > -L(ExitHighCase2): > - cmp $9, %r8 > - je L(Exit9) > - test $0x01, %ah > - jnz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - test $0x02, %ah > - jnz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - test $0x04, %ah > - jnz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - test $0x8, %ah > - jnz L(Exit12) > - cmp $13, %r8 > - je L(Exit13) > - test $0x10, %ah > - jnz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - test $0x20, %ah > - jnz L(Exit14) > - cmp $15, %r8 > - je L(Exit15) > - test $0x40, %ah > - jnz L(Exit15) > - jmp L(Exit16) > - > -L(CopyFrom1To16BytesCase2OrCase3): > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - > - .p2align 4 > -L(CopyFrom1To16BytesCase3): > - add $16, %r8 > - add %rsi, %rdx > - add %rsi, %rcx > - > - cmp $16, %r8 > - je L(Exit16) > - cmp $8, %r8 > - je L(Exit8) > - jg L(More8Case3) > - cmp $4, %r8 > - je L(Exit4) > - jg L(More4Case3) > - cmp $2, %r8 > - jl L(Exit1) > - je L(Exit2) > - jg L(Exit3) > -L(More8Case3): /* but less than 16 */ > - cmp $12, %r8 > - je L(Exit12) > - jl L(Less12Case3) > - cmp $14, %r8 > - jl L(Exit13) > - je L(Exit14) > - jg L(Exit15) > -L(More4Case3): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Exit5) > - je L(Exit6) > - jg L(Exit7) > -L(Less12Case3): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Exit9) > - je L(Exit10) > - jg L(Exit11) > -# endif > - > - .p2align 4 > -L(Exit1): > - movb (%rcx), %al > - movb %al, (%rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $1, %r8 > - lea 1(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit2): > - movw (%rcx), %ax > - movw %ax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 1(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $2, %r8 > - lea 2(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit3): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - movb 2(%rcx), %al > - movb %al, 2(%rdx) > -# ifdef USE_AS_STPCPY > - lea 2(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $3, %r8 > - lea 3(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit4): > - movl (%rcx), %eax > - movl %eax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 3(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $4, %r8 > - lea 4(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit5): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movb 4(%rcx), %al > - movb %al, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 4(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $5, %r8 > - lea 5(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef 
USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit6): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movw 4(%rcx), %ax > - movw %ax, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 5(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $6, %r8 > - lea 6(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit7): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movl 3(%rcx), %eax > - movl %eax, 3(%rdx) > -# ifdef USE_AS_STPCPY > - lea 6(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $7, %r8 > - lea 7(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit9): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %eax > - mov %eax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 8(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $9, %r8 > - lea 9(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit10): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %eax > - mov %eax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 9(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $10, %r8 > - lea 10(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit11): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %eax > - mov %eax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 10(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $11, %r8 > - lea 11(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit12): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %eax > - mov %eax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 11(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $12, %r8 > - lea 12(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit13): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %rax > - mov %rax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 12(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $13, %r8 > - lea 13(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit14): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %rax > - mov %rax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 13(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $14, %r8 > - lea 14(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit15): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $15, %r8 > - lea 
15(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(Fill0): > - ret > - > - .p2align 4 > -L(Fill1): > - movb %dl, (%rcx) > - ret > - > - .p2align 4 > -L(Fill2): > - movw %dx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill3): > - movw %dx, (%rcx) > - movb %dl, 2(%rcx) > - ret > - > - .p2align 4 > -L(Fill4): > - movl %edx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill5): > - movl %edx, (%rcx) > - movb %dl, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill6): > - movl %edx, (%rcx) > - movw %dx, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill7): > - movl %edx, (%rcx) > - movl %edx, 3(%rcx) > - ret > - > - .p2align 4 > -L(Fill8): > - mov %rdx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill9): > - mov %rdx, (%rcx) > - movb %dl, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill10): > - mov %rdx, (%rcx) > - movw %dx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill11): > - mov %rdx, (%rcx) > - movl %edx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill12): > - mov %rdx, (%rcx) > - movl %edx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill13): > - mov %rdx, (%rcx) > - mov %rdx, 5(%rcx) > - ret > - > - .p2align 4 > -L(Fill14): > - mov %rdx, (%rcx) > - mov %rdx, 6(%rcx) > - ret > - > - .p2align 4 > -L(Fill15): > - mov %rdx, (%rcx) > - mov %rdx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill16): > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - ret > - > - .p2align 4 > -L(StrncpyFillExit1): > - lea 16(%r8), %r8 > -L(FillFrom1To16Bytes): > - test %r8, %r8 > - jz L(Fill0) > - cmp $16, %r8 > - je L(Fill16) > - cmp $8, %r8 > - je L(Fill8) > - jg L(FillMore8) > - cmp $4, %r8 > - je L(Fill4) > - jg L(FillMore4) > - cmp $2, %r8 > - jl L(Fill1) > - je L(Fill2) > - jg L(Fill3) > -L(FillMore8): /* but less than 16 */ > - cmp $12, %r8 > - je L(Fill12) > - jl L(FillLess12) > - cmp $14, %r8 > - jl L(Fill13) > - je L(Fill14) > - jg L(Fill15) > -L(FillMore4): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Fill5) > - je L(Fill6) > - jg L(Fill7) > -L(FillLess12): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Fill9) > - je L(Fill10) > - jmp L(Fill11) > - > - .p2align 4 > -L(StrncpyFillTailWithZero1): > - xor %rdx, %rdx > - sub $16, %r8 > - jbe L(StrncpyFillExit1) > - > - pxor %xmm0, %xmm0 > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - > - lea 16(%rcx), %rcx > - > - mov %rcx, %rdx > - and $0xf, %rdx > - sub %rdx, %rcx > - add %rdx, %r8 > - xor %rdx, %rdx > - sub $64, %r8 > - jb L(StrncpyFillLess64) > - > -L(StrncpyFillLoopMovdqa): > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - movdqa %xmm0, 32(%rcx) > - movdqa %xmm0, 48(%rcx) > - lea 64(%rcx), %rcx > - sub $64, %r8 > - jae L(StrncpyFillLoopMovdqa) > - > -L(StrncpyFillLess64): > - add $32, %r8 > - jl L(StrncpyFillLess32) > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - lea 32(%rcx), %rcx > - sub $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > -L(StrncpyFillLess32): > - add $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > - .p2align 4 > -L(Exit0): > - mov %rdx, %rax > - ret > - > - .p2align 4 > -L(StrncpyExit15Bytes): > - cmp $9, %r8 > - je L(Exit9) > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmp 
$13, %r8 > - je L(Exit13) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > - .p2align 4 > -L(StrncpyExit8Bytes): > - cmp $1, %r8 > - je L(Exit1) > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmp $2, %r8 > - je L(Exit2) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > -# endif > -# endif > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(StrncpyLeaveCase2OrCase3): > - test %rax, %rax > - jnz L(Aligned64LeaveCase2) > - > -L(Aligned64LeaveCase3): > - lea 64(%r8), %r8 > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase3) > - > -L(Aligned64LeaveCase2): > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - add $48, %r8 > - jle L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase2) > -/*--------------------------------------------------*/ > - .p2align 4 > -L(StrncpyExit1Case2OrCase3): > - movdqu -1(%rcx), %xmm0 > - movdqu %xmm0, -1(%rdx) > - mov $15, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit2Case2OrCase3): > - movdqu -2(%rcx), %xmm0 > - movdqu %xmm0, -2(%rdx) > - mov $14, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit3Case2OrCase3): > - movdqu -3(%rcx), %xmm0 > - movdqu %xmm0, -3(%rdx) > - mov $13, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit4Case2OrCase3): > - movdqu -4(%rcx), %xmm0 > - movdqu %xmm0, -4(%rdx) > - mov $12, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit5Case2OrCase3): > - movdqu -5(%rcx), %xmm0 > - movdqu %xmm0, -5(%rdx) > - mov $11, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > 
-L(StrncpyExit6Case2OrCase3): > - mov (%rcx), %rsi > - mov 6(%rcx), %r9d > - mov %r9d, 6(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $10, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit7Case2OrCase3): > - mov (%rcx), %rsi > - mov 5(%rcx), %r9d > - mov %r9d, 5(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $9, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit8Case2OrCase3): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit9Case2OrCase3): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit10Case2OrCase3): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit11Case2OrCase3): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit12Case2OrCase3): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit13Case2OrCase3): > - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit14Case2OrCase3): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit15Case2OrCase3): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave1): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit1) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit1): > - lea 15(%rdx, %rsi), %rdx > - lea 15(%rcx, %rsi), %rcx > - mov -15(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -15(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave2): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit2) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit2): > - lea 14(%rdx, %rsi), %rdx > - lea 14(%rcx, %rsi), %rcx > - mov -14(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -14(%rdx) > - mov %rax, 
-8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave3): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit3) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit3): > - lea 13(%rdx, %rsi), %rdx > - lea 13(%rcx, %rsi), %rcx > - mov -13(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -13(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave4): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit4) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit4): > - lea 12(%rdx, %rsi), %rdx > - lea 12(%rcx, %rsi), %rcx > - mov -12(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -12(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave5): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit5) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit5): > - lea 11(%rdx, %rsi), %rdx > - lea 11(%rcx, %rsi), %rcx > - mov -11(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -11(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave6): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit6) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit6): > - lea 10(%rdx, %rsi), %rdx > - lea 10(%rcx, %rsi), %rcx > - mov -10(%rcx), %rsi > - movw -2(%rcx), %ax > - mov %rsi, -10(%rdx) > - movw %ax, -2(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave7): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit7) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > 
- jbe L(StrncpyExit7) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit7): > - lea 9(%rdx, %rsi), %rdx > - lea 9(%rcx, %rsi), %rcx > - mov -9(%rcx), %rsi > - movb -1(%rcx), %ah > - mov %rsi, -9(%rdx) > - movb %ah, -1(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave8): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit8) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit8): > - lea 8(%rdx, %rsi), %rdx > - lea 8(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave9): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit9) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit9): > - lea 7(%rdx, %rsi), %rdx > - lea 7(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave10): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit10) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit10): > - lea 6(%rdx, %rsi), %rdx > - lea 6(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave11): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit11) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit11): > - lea 5(%rdx, %rsi), %rdx > - lea 5(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave12): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit12) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe 
L(StrncpyExit12) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit12): > - lea 4(%rdx, %rsi), %rdx > - lea 4(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave13): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit13) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit13): > - lea 3(%rdx, %rsi), %rdx > - lea 3(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave14): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit14) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit14): > - lea 2(%rdx, %rsi), %rdx > - lea 2(%rcx, %rsi), %rcx > - movw -2(%rcx), %ax > - xor %rsi, %rsi > - movw %ax, -2(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave15): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit15) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit15): > - lea 1(%rdx, %rsi), %rdx > - lea 1(%rcx, %rsi), %rcx > - movb -1(%rcx), %ah > - xor %rsi, %rsi > - movb %ah, -1(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > -# endif > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# endif > -#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S > deleted file mode 100644 > index bf82ee447d..0000000000 > --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STRNCPY > -#define STRCPY __strncpy_ssse3 > -#include "strcpy-ssse3.S" > -- > 2.25.1 > ^ permalink raw reply [flat|nested] 49+ messages in thread
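A note on the L(Shl1)..L(Shl15) bodies that make up the bulk of the file
deleted above: they all implement the same SSSE3 idiom. The source is read
with 16-byte *aligned* loads (which, unlike unaligned loads near the end of
a string, can never cross a page boundary past it), and each output vector
is stitched together from two consecutive aligned vectors with palignr,
whose byte shift is an immediate operand. Below is a minimal C-intrinsics
sketch of one step of the L(Shl14) case; the helper name and its contract
are illustrative assumptions, not glibc code.

/* Sketch of one L(Shl14) step: copy 16 string bytes whose source S is
   14 bytes past a 16-byte boundary.  Assumes ((uintptr_t) s % 16) == 14
   and that s[0..1] were already checked for NUL by the previous step
   (they sat in the previous iteration's high vector, as in the asm).
   Compile with -mssse3.  */
#include <stdint.h>
#include <tmmintrin.h>		/* SSSE3: _mm_alignr_epi8.  */

static int
copy_chunk_shl14 (char *dst, const char *s)
{
  const __m128i *p = (const __m128i *) ((uintptr_t) s & ~(uintptr_t) 15);
  __m128i lo = _mm_load_si128 (p);	/* s[-14] .. s[1], aligned.  */
  __m128i hi = _mm_load_si128 (p + 1);	/* s[2]  .. s[17], aligned.  */

  /* pcmpeqb/pmovmskb: bail out before storing if the new vector holds
     the terminator, as the asm branches to L(Shl14LoopExit).  */
  if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (hi, _mm_setzero_si128 ())))
    return 1;	/* Caller finishes the tail byte-by-byte.  */

  /* palignr $14: the top two bytes of LO followed by the low 14 bytes
     of HI, i.e. exactly s[0..15]; the store itself may be unaligned.  */
  _mm_storeu_si128 ((__m128i *) dst, _mm_alignr_epi8 (hi, lo, 14));
  return 0;
}

Because the palignr byte count must be a compile-time immediate, the
assembly needs a separately unrolled copy loop for each of the fifteen
possible nonzero misalignments, which is where most of the several
thousand deleted lines go; once the SSE4.1/AVX2/EVEX implementations
cover nearly every CPU that would otherwise select this code, that
footprint is precisely the code-size cost this series is trading away.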
* [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (2 preceding siblings ...) 2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein @ 2022-04-10 0:42 ` Noah Goldstein 2022-04-10 0:48 ` Noah Goldstein 2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein ` (5 subsequent siblings) 9 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +- sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - 5 files changed, 6 insertions(+), 3212 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5b02ec8de5..303fb5d734 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,6 @@ sysdep_routines += \ memcmpeq-evex \ memcmpeq-sse2 \ memcpy-ssse3 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -25,7 +24,6 @@ sysdep_routines += \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ memmove-ssse3 \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 49ce6860d0..c6008a73ed 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3) @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3) @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, @@ -958,9 
+948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3) @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..fb01fbb301 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + return OPTIMIZE (ssse3); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. 
TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub 
$0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - 
movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu 
%xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, 
%xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - 
palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) -L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 
0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), 
%rdx, 4) - - .p2align 4 -L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, 
-0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), 
%xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx -L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy_fwd): - sub $0x80, %rdx -L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx -L(ll_cache_copy_fwd): - add %rcx, %rdx -L(ll_cache_copy_fwd_start): - sub $0x80, %rdx -L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) -L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx -L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy): - sub $0x80, %rdx 
-L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx -L(ll_cache_copy): - add %rcx, %rdx -L(ll_cache_copy_bwd_start): - sub $0x80, %rdx -L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) -L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -L(fwd_write_0bytes): - ret - - - .p2align 4 -L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -L(fwd_write_46bytes): - 
lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 
-L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) -L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) 
-L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -L(fwd_write_21bytes): - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 
112(%rdi) -L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) -L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -L(bwd_write_0bytes): - ret - - .p2align 4 -L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - - .p2align 4 -L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_12bytes): - mov 
4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) -L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu 
%xmm0, 39(%rdi) -L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -L(bwd_write_98bytes): - lddqu 
82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) -L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL 
(L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL 
(L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - - .p2align 3 -L(table_144_bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_7bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_65bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) - .int 
JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - - .p2align 3 -L(shl_table_fwd): - .int JMPTBL (L(shl_0), L(shl_table_fwd)) - .int JMPTBL (L(shl_1), L(shl_table_fwd)) - .int JMPTBL (L(shl_2), L(shl_table_fwd)) - .int JMPTBL (L(shl_3), L(shl_table_fwd)) - .int JMPTBL (L(shl_4), L(shl_table_fwd)) - .int JMPTBL (L(shl_5), L(shl_table_fwd)) - .int JMPTBL (L(shl_6), L(shl_table_fwd)) - .int JMPTBL (L(shl_7), L(shl_table_fwd)) - .int JMPTBL (L(shl_8), L(shl_table_fwd)) - .int JMPTBL (L(shl_9), L(shl_table_fwd)) - .int JMPTBL (L(shl_10), L(shl_table_fwd)) - .int JMPTBL (L(shl_11), L(shl_table_fwd)) - .int JMPTBL (L(shl_12), L(shl_table_fwd)) - .int JMPTBL (L(shl_13), L(shl_table_fwd)) - .int JMPTBL (L(shl_14), L(shl_table_fwd)) - .int JMPTBL (L(shl_15), L(shl_table_fwd)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S deleted file mode 100644 index f9a4e9aff9..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_back -#define MEMCPY_CHK __memmove_chk_ssse3_back -#include "memcpy-ssse3-back.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
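A note on the dispatch pattern above: the JMPTBL/BRANCH_TO_JMPTBL_ENTRY pairs used throughout the deleted file implement a position-independent computed jump. Each ".int JMPTBL (L(target), L(table))" slot assembles to the target label's signed 32-bit offset from the table itself, and the dispatch macro (roughly: lea table(%rip), %r11; movslq (%r11,%rdx,4), %rdx; add %r11, %rdx; jmp *%rdx) adds that offset back onto the table's runtime address before jumping. A minimal C sketch of the same scheme, with illustrative names (copy_fn and jmptbl_target are not glibc identifiers):

    #include <stdint.h>
    #include <stddef.h>

    typedef void (*copy_fn) (void);

    /* A slot holds I - B: the handler's address expressed as an offset
       from the table base.  Relative slots keep the read-only table
       free of load-time relocations.  */
    static copy_fn
    jmptbl_target (const int32_t *table, size_t index)
    {
      /* Compute table base + slot value, as BRANCH_TO_JMPTBL_ENTRY
         (table, %rdx, 4) does with the residual byte count in %rdx.  */
      return (copy_fn) ((uintptr_t) table + (intptr_t) table[index]);
    }

With 144 slots per table, the last 0-143 bytes of any copy are finished by a single indirect jump to a size-specific fwd_write_*/bwd_write_* stub rather than a byte loop, which is why the tables above enumerate every byte count individually.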
* Re: [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back 2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-04-10 0:48 ` Noah Goldstein 0 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw) To: GNU C Library Disregard this patch. It's from the wrong patchset. On Sat, Apr 9, 2022 at 7:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - > sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +- > sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- > sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - > 5 files changed, 6 insertions(+), 3212 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 5b02ec8de5..303fb5d734 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -17,7 +17,6 @@ sysdep_routines += \ > memcmpeq-evex \ > memcmpeq-sse2 \ > memcpy-ssse3 \ > - memcpy-ssse3-back \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > memmove-avx512-no-vzeroupper \ > @@ -25,7 +24,6 @@ sysdep_routines += \ > memmove-evex-unaligned-erms \ > memmove-sse2-unaligned-erms \ > memmove-ssse3 \ > - memmove-ssse3-back \ > memrchr-avx2 \ > memrchr-avx2-rtm \ > memrchr-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 49ce6860d0..c6008a73ed 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memmove_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memmove_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (SSSE3), > __memmove_chk_ssse3) > @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memmove, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_avx512_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > - __memmove_ssse3_back) > IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > __memmove_ssse3) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) > @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __memcpy_chk_ssse3) > @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memcpy, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > - __memcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > 
__memcpy_ssse3) > IFUNC_IMPL_ADD (array, i, memcpy, > @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __mempcpy_chk_ssse3) > @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, mempcpy, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > __mempcpy_ssse3) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h > index f8f958064c..fb01fbb301 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h > @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) > attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) > attribute_hidden; > @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) > } > } > > - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > - return OPTIMIZE (sse2_unaligned_erms); > - > - return OPTIMIZE (sse2_unaligned); > + return OPTIMIZE (ssse3); > } > > - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) > - return OPTIMIZE (ssse3_back); > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE (sse2_unaligned_erms); > > - return OPTIMIZE (ssse3); > + return OPTIMIZE (sse2_unaligned); > } > diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > deleted file mode 100644 > index 92cfbf7933..0000000000 > --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > +++ /dev/null > @@ -1,3181 +0,0 @@ > -/* memcpy with SSSE3 and REP string > - Copyright (C) 2010-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#if IS_IN (libc) > - > -#include "asm-syntax.h" > - > -#ifndef MEMCPY > -# define MEMCPY __memcpy_ssse3_back > -# define MEMCPY_CHK __memcpy_chk_ssse3_back > -# define MEMPCPY __mempcpy_ssse3_back > -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back > -#endif > - > -#define JMPTBL(I, B) I - B > - > -/* Branch to an entry in a jump table. TABLE is a jump table with > - relative offsets. INDEX is a register contains the index into the > - jump table. SCALE is the scale of INDEX. */ > -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), INDEX; \ > - lea (%r11, INDEX), INDEX; \ > - _CET_NOTRACK jmp *INDEX; \ > - ud2 > - > - .section .text.ssse3,"ax",@progbits > -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE > -ENTRY (MEMPCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMPCPY_CHK) > - > -ENTRY (MEMPCPY) > - mov %RDI_LP, %RAX_LP > - add %RDX_LP, %RAX_LP > - jmp L(start) > -END (MEMPCPY) > -#endif > - > -#if !defined USE_AS_BCOPY > -ENTRY (MEMCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMCPY_CHK) > -#endif > - > -ENTRY (MEMCPY) > - mov %RDI_LP, %RAX_LP > -#ifdef USE_AS_MEMPCPY > - add %RDX_LP, %RAX_LP > -#endif > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -#endif > - > -#ifdef USE_AS_MEMMOVE > - cmp %rsi, %rdi > - jb L(copy_forward) > - je L(bwd_write_0bytes) > - cmp $144, %rdx > - jae L(copy_backward) > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -L(copy_forward): > -#endif > -L(start): > - cmp $144, %rdx > - jae L(144bytesormore) > - > -L(fwd_write_less32bytes): > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jbe L(bk_write) > -#endif > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > -#ifndef USE_AS_MEMMOVE > -L(bk_write): > - > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -#endif > - > - .p2align 4 > -L(144bytesormore): > - > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jle L(copy_backward) > -#endif > - movdqu (%rsi), %xmm0 > - mov %rdi, %r8 > - and $-16, %rdi > - add $16, %rdi > - mov %rdi, %r9 > - sub %r8, %r9 > - sub %r9, %rdx > - add %r9, %rsi > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0) > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - cmp %rcx, %rdx > - jae L(gobble_mem_fwd) > - lea L(shl_table_fwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(copy_backward): > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - shl $1, %rcx > - cmp %rcx, %rdx > - ja L(gobble_mem_bwd) > - > - add %rdx, %rdi > - add %rdx, %rsi > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $0xf, %r9 > - xor %r9, %rdi > - sub %r9, %rsi > - sub %r9, %rdx > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0_bwd) > - lea L(shl_table_bwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(shl_0): > - > - mov %rdx, %r9 > - shr $8, %r9 > - add %rdx, %r9 > -#ifdef DATA_CACHE_SIZE > - cmp $DATA_CACHE_SIZE_HALF, %R9_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %R9_LP > -#endif > - jae L(gobble_mem_fwd) > - sub $0x80, %rdx > - .p2align 4 > -L(shl_0_loop): > - movdqa 
(%rsi), %xmm1 > - movdqa %xmm1, (%rdi) > - movaps 0x10(%rsi), %xmm2 > - movaps %xmm2, 0x10(%rdi) > - movaps 0x20(%rsi), %xmm3 > - movaps %xmm3, 0x20(%rdi) > - movaps 0x30(%rsi), %xmm4 > - movaps %xmm4, 0x30(%rdi) > - movaps 0x40(%rsi), %xmm1 > - movaps %xmm1, 0x40(%rdi) > - movaps 0x50(%rsi), %xmm2 > - movaps %xmm2, 0x50(%rdi) > - movaps 0x60(%rsi), %xmm3 > - movaps %xmm3, 0x60(%rdi) > - movaps 0x70(%rsi), %xmm4 > - movaps %xmm4, 0x70(%rdi) > - sub $0x80, %rdx > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(shl_0_loop) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_0_bwd): > - sub $0x80, %rdx > -L(copy_backward_loop): > - movaps -0x10(%rsi), %xmm1 > - movaps %xmm1, -0x10(%rdi) > - movaps -0x20(%rsi), %xmm2 > - movaps %xmm2, -0x20(%rdi) > - movaps -0x30(%rsi), %xmm3 > - movaps %xmm3, -0x30(%rdi) > - movaps -0x40(%rsi), %xmm4 > - movaps %xmm4, -0x40(%rdi) > - movaps -0x50(%rsi), %xmm5 > - movaps %xmm5, -0x50(%rdi) > - movaps -0x60(%rsi), %xmm5 > - movaps %xmm5, -0x60(%rdi) > - movaps -0x70(%rsi), %xmm5 > - movaps %xmm5, -0x70(%rdi) > - movaps -0x80(%rsi), %xmm5 > - movaps %xmm5, -0x80(%rdi) > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(copy_backward_loop) > - > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1): > - sub $0x80, %rdx > - movaps -0x01(%rsi), %xmm1 > - movaps 0x0f(%rsi), %xmm2 > - movaps 0x1f(%rsi), %xmm3 > - movaps 0x2f(%rsi), %xmm4 > - movaps 0x3f(%rsi), %xmm5 > - movaps 0x4f(%rsi), %xmm6 > - movaps 0x5f(%rsi), %xmm7 > - movaps 0x6f(%rsi), %xmm8 > - movaps 0x7f(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $1, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $1, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $1, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $1, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $1, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $1, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $1, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_1) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1_bwd): > - movaps -0x01(%rsi), %xmm1 > - > - movaps -0x11(%rsi), %xmm2 > - palignr $1, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x21(%rsi), %xmm3 > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x31(%rsi), %xmm4 > - palignr $1, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x41(%rsi), %xmm5 > - palignr $1, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x51(%rsi), %xmm6 > - palignr $1, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x61(%rsi), %xmm7 > - palignr $1, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x71(%rsi), %xmm8 > - palignr $1, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x81(%rsi), %xmm9 > - palignr $1, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_1_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > 
-L(shl_2): > - sub $0x80, %rdx > - movaps -0x02(%rsi), %xmm1 > - movaps 0x0e(%rsi), %xmm2 > - movaps 0x1e(%rsi), %xmm3 > - movaps 0x2e(%rsi), %xmm4 > - movaps 0x3e(%rsi), %xmm5 > - movaps 0x4e(%rsi), %xmm6 > - movaps 0x5e(%rsi), %xmm7 > - movaps 0x6e(%rsi), %xmm8 > - movaps 0x7e(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $2, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $2, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $2, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $2, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $2, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $2, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $2, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_2) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_2_bwd): > - movaps -0x02(%rsi), %xmm1 > - > - movaps -0x12(%rsi), %xmm2 > - palignr $2, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x22(%rsi), %xmm3 > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x32(%rsi), %xmm4 > - palignr $2, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x42(%rsi), %xmm5 > - palignr $2, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x52(%rsi), %xmm6 > - palignr $2, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x62(%rsi), %xmm7 > - palignr $2, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x72(%rsi), %xmm8 > - palignr $2, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x82(%rsi), %xmm9 > - palignr $2, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_2_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3): > - sub $0x80, %rdx > - movaps -0x03(%rsi), %xmm1 > - movaps 0x0d(%rsi), %xmm2 > - movaps 0x1d(%rsi), %xmm3 > - movaps 0x2d(%rsi), %xmm4 > - movaps 0x3d(%rsi), %xmm5 > - movaps 0x4d(%rsi), %xmm6 > - movaps 0x5d(%rsi), %xmm7 > - movaps 0x6d(%rsi), %xmm8 > - movaps 0x7d(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $3, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $3, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $3, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $3, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $3, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $3, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $3, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_3) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3_bwd): > - movaps -0x03(%rsi), %xmm1 > - > - movaps -0x13(%rsi), %xmm2 > - palignr $3, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x23(%rsi), %xmm3 > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x33(%rsi), %xmm4 > - palignr $3, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x43(%rsi), %xmm5 > - palignr $3, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x53(%rsi), %xmm6 > - palignr $3, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps 
-0x63(%rsi), %xmm7 > - palignr $3, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x73(%rsi), %xmm8 > - palignr $3, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x83(%rsi), %xmm9 > - palignr $3, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_3_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4): > - sub $0x80, %rdx > - movaps -0x04(%rsi), %xmm1 > - movaps 0x0c(%rsi), %xmm2 > - movaps 0x1c(%rsi), %xmm3 > - movaps 0x2c(%rsi), %xmm4 > - movaps 0x3c(%rsi), %xmm5 > - movaps 0x4c(%rsi), %xmm6 > - movaps 0x5c(%rsi), %xmm7 > - movaps 0x6c(%rsi), %xmm8 > - movaps 0x7c(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $4, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $4, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $4, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $4, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $4, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $4, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $4, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_4) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4_bwd): > - movaps -0x04(%rsi), %xmm1 > - > - movaps -0x14(%rsi), %xmm2 > - palignr $4, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x24(%rsi), %xmm3 > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x34(%rsi), %xmm4 > - palignr $4, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x44(%rsi), %xmm5 > - palignr $4, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x54(%rsi), %xmm6 > - palignr $4, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x64(%rsi), %xmm7 > - palignr $4, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x74(%rsi), %xmm8 > - palignr $4, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x84(%rsi), %xmm9 > - palignr $4, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_4_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_5): > - sub $0x80, %rdx > - movaps -0x05(%rsi), %xmm1 > - movaps 0x0b(%rsi), %xmm2 > - movaps 0x1b(%rsi), %xmm3 > - movaps 0x2b(%rsi), %xmm4 > - movaps 0x3b(%rsi), %xmm5 > - movaps 0x4b(%rsi), %xmm6 > - movaps 0x5b(%rsi), %xmm7 > - movaps 0x6b(%rsi), %xmm8 > - movaps 0x7b(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $5, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $5, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $5, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $5, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $5, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $5, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $5, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_5) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > 
-L(shl_5_bwd): > - movaps -0x05(%rsi), %xmm1 > - > - movaps -0x15(%rsi), %xmm2 > - palignr $5, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x25(%rsi), %xmm3 > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x35(%rsi), %xmm4 > - palignr $5, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x45(%rsi), %xmm5 > - palignr $5, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x55(%rsi), %xmm6 > - palignr $5, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x65(%rsi), %xmm7 > - palignr $5, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x75(%rsi), %xmm8 > - palignr $5, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x85(%rsi), %xmm9 > - palignr $5, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_5_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6): > - sub $0x80, %rdx > - movaps -0x06(%rsi), %xmm1 > - movaps 0x0a(%rsi), %xmm2 > - movaps 0x1a(%rsi), %xmm3 > - movaps 0x2a(%rsi), %xmm4 > - movaps 0x3a(%rsi), %xmm5 > - movaps 0x4a(%rsi), %xmm6 > - movaps 0x5a(%rsi), %xmm7 > - movaps 0x6a(%rsi), %xmm8 > - movaps 0x7a(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $6, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $6, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $6, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $6, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $6, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $6, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $6, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_6) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6_bwd): > - movaps -0x06(%rsi), %xmm1 > - > - movaps -0x16(%rsi), %xmm2 > - palignr $6, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x26(%rsi), %xmm3 > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x36(%rsi), %xmm4 > - palignr $6, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x46(%rsi), %xmm5 > - palignr $6, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x56(%rsi), %xmm6 > - palignr $6, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x66(%rsi), %xmm7 > - palignr $6, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x76(%rsi), %xmm8 > - palignr $6, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x86(%rsi), %xmm9 > - palignr $6, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_6_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7): > - sub $0x80, %rdx > - movaps -0x07(%rsi), %xmm1 > - movaps 0x09(%rsi), %xmm2 > - movaps 0x19(%rsi), %xmm3 > - movaps 0x29(%rsi), %xmm4 > - movaps 0x39(%rsi), %xmm5 > - movaps 0x49(%rsi), %xmm6 > - movaps 0x59(%rsi), %xmm7 > - movaps 0x69(%rsi), %xmm8 > - movaps 0x79(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $7, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $7, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $7, %xmm6, %xmm7 > - 
movaps %xmm7, 0x50(%rdi) > - palignr $7, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $7, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $7, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $7, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_7) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7_bwd): > - movaps -0x07(%rsi), %xmm1 > - > - movaps -0x17(%rsi), %xmm2 > - palignr $7, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x27(%rsi), %xmm3 > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x37(%rsi), %xmm4 > - palignr $7, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x47(%rsi), %xmm5 > - palignr $7, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x57(%rsi), %xmm6 > - palignr $7, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x67(%rsi), %xmm7 > - palignr $7, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x77(%rsi), %xmm8 > - palignr $7, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x87(%rsi), %xmm9 > - palignr $7, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_7_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8): > - sub $0x80, %rdx > - movaps -0x08(%rsi), %xmm1 > - movaps 0x08(%rsi), %xmm2 > - movaps 0x18(%rsi), %xmm3 > - movaps 0x28(%rsi), %xmm4 > - movaps 0x38(%rsi), %xmm5 > - movaps 0x48(%rsi), %xmm6 > - movaps 0x58(%rsi), %xmm7 > - movaps 0x68(%rsi), %xmm8 > - movaps 0x78(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $8, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $8, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $8, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $8, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $8, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $8, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $8, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_8) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8_bwd): > - movaps -0x08(%rsi), %xmm1 > - > - movaps -0x18(%rsi), %xmm2 > - palignr $8, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x28(%rsi), %xmm3 > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x38(%rsi), %xmm4 > - palignr $8, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x48(%rsi), %xmm5 > - palignr $8, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x58(%rsi), %xmm6 > - palignr $8, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x68(%rsi), %xmm7 > - palignr $8, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x78(%rsi), %xmm8 > - palignr $8, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x88(%rsi), %xmm9 > - palignr $8, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_8_bwd) > -L(shl_8_end_bwd): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - 
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9): > - sub $0x80, %rdx > - movaps -0x09(%rsi), %xmm1 > - movaps 0x07(%rsi), %xmm2 > - movaps 0x17(%rsi), %xmm3 > - movaps 0x27(%rsi), %xmm4 > - movaps 0x37(%rsi), %xmm5 > - movaps 0x47(%rsi), %xmm6 > - movaps 0x57(%rsi), %xmm7 > - movaps 0x67(%rsi), %xmm8 > - movaps 0x77(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $9, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $9, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $9, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $9, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $9, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $9, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $9, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_9) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9_bwd): > - movaps -0x09(%rsi), %xmm1 > - > - movaps -0x19(%rsi), %xmm2 > - palignr $9, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x29(%rsi), %xmm3 > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x39(%rsi), %xmm4 > - palignr $9, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x49(%rsi), %xmm5 > - palignr $9, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x59(%rsi), %xmm6 > - palignr $9, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x69(%rsi), %xmm7 > - palignr $9, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x79(%rsi), %xmm8 > - palignr $9, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x89(%rsi), %xmm9 > - palignr $9, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_9_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10): > - sub $0x80, %rdx > - movaps -0x0a(%rsi), %xmm1 > - movaps 0x06(%rsi), %xmm2 > - movaps 0x16(%rsi), %xmm3 > - movaps 0x26(%rsi), %xmm4 > - movaps 0x36(%rsi), %xmm5 > - movaps 0x46(%rsi), %xmm6 > - movaps 0x56(%rsi), %xmm7 > - movaps 0x66(%rsi), %xmm8 > - movaps 0x76(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $10, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $10, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $10, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $10, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $10, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $10, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $10, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_10) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10_bwd): > - movaps -0x0a(%rsi), %xmm1 > - > - movaps -0x1a(%rsi), %xmm2 > - palignr $10, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2a(%rsi), %xmm3 > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3a(%rsi), %xmm4 > - palignr $10, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4a(%rsi), %xmm5 > - palignr $10, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps 
-0x5a(%rsi), %xmm6 > - palignr $10, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6a(%rsi), %xmm7 > - palignr $10, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7a(%rsi), %xmm8 > - palignr $10, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8a(%rsi), %xmm9 > - palignr $10, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_10_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11): > - sub $0x80, %rdx > - movaps -0x0b(%rsi), %xmm1 > - movaps 0x05(%rsi), %xmm2 > - movaps 0x15(%rsi), %xmm3 > - movaps 0x25(%rsi), %xmm4 > - movaps 0x35(%rsi), %xmm5 > - movaps 0x45(%rsi), %xmm6 > - movaps 0x55(%rsi), %xmm7 > - movaps 0x65(%rsi), %xmm8 > - movaps 0x75(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $11, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $11, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $11, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $11, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $11, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $11, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $11, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_11) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11_bwd): > - movaps -0x0b(%rsi), %xmm1 > - > - movaps -0x1b(%rsi), %xmm2 > - palignr $11, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2b(%rsi), %xmm3 > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3b(%rsi), %xmm4 > - palignr $11, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4b(%rsi), %xmm5 > - palignr $11, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5b(%rsi), %xmm6 > - palignr $11, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6b(%rsi), %xmm7 > - palignr $11, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7b(%rsi), %xmm8 > - palignr $11, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8b(%rsi), %xmm9 > - palignr $11, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_11_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12): > - sub $0x80, %rdx > - movdqa -0x0c(%rsi), %xmm1 > - movaps 0x04(%rsi), %xmm2 > - movaps 0x14(%rsi), %xmm3 > - movaps 0x24(%rsi), %xmm4 > - movaps 0x34(%rsi), %xmm5 > - movaps 0x44(%rsi), %xmm6 > - movaps 0x54(%rsi), %xmm7 > - movaps 0x64(%rsi), %xmm8 > - movaps 0x74(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $12, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $12, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $12, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $12, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $12, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $12, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $12, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - > - lea 0x80(%rdi), %rdi > - jae L(shl_12) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx 
> - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12_bwd): > - movaps -0x0c(%rsi), %xmm1 > - > - movaps -0x1c(%rsi), %xmm2 > - palignr $12, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2c(%rsi), %xmm3 > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3c(%rsi), %xmm4 > - palignr $12, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4c(%rsi), %xmm5 > - palignr $12, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5c(%rsi), %xmm6 > - palignr $12, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6c(%rsi), %xmm7 > - palignr $12, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7c(%rsi), %xmm8 > - palignr $12, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8c(%rsi), %xmm9 > - palignr $12, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_12_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13): > - sub $0x80, %rdx > - movaps -0x0d(%rsi), %xmm1 > - movaps 0x03(%rsi), %xmm2 > - movaps 0x13(%rsi), %xmm3 > - movaps 0x23(%rsi), %xmm4 > - movaps 0x33(%rsi), %xmm5 > - movaps 0x43(%rsi), %xmm6 > - movaps 0x53(%rsi), %xmm7 > - movaps 0x63(%rsi), %xmm8 > - movaps 0x73(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $13, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $13, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $13, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $13, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $13, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $13, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $13, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_13) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13_bwd): > - movaps -0x0d(%rsi), %xmm1 > - > - movaps -0x1d(%rsi), %xmm2 > - palignr $13, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2d(%rsi), %xmm3 > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3d(%rsi), %xmm4 > - palignr $13, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4d(%rsi), %xmm5 > - palignr $13, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5d(%rsi), %xmm6 > - palignr $13, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6d(%rsi), %xmm7 > - palignr $13, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7d(%rsi), %xmm8 > - palignr $13, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8d(%rsi), %xmm9 > - palignr $13, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_13_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14): > - sub $0x80, %rdx > - movaps -0x0e(%rsi), %xmm1 > - movaps 0x02(%rsi), %xmm2 > - movaps 0x12(%rsi), %xmm3 > - movaps 0x22(%rsi), %xmm4 > - movaps 0x32(%rsi), %xmm5 > - movaps 0x42(%rsi), %xmm6 > - movaps 0x52(%rsi), %xmm7 > - movaps 0x62(%rsi), %xmm8 > - movaps 0x72(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > 
- palignr $14, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $14, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $14, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $14, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $14, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $14, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $14, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_14) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14_bwd): > - movaps -0x0e(%rsi), %xmm1 > - > - movaps -0x1e(%rsi), %xmm2 > - palignr $14, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2e(%rsi), %xmm3 > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3e(%rsi), %xmm4 > - palignr $14, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4e(%rsi), %xmm5 > - palignr $14, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5e(%rsi), %xmm6 > - palignr $14, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6e(%rsi), %xmm7 > - palignr $14, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7e(%rsi), %xmm8 > - palignr $14, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8e(%rsi), %xmm9 > - palignr $14, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_14_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15): > - sub $0x80, %rdx > - movaps -0x0f(%rsi), %xmm1 > - movaps 0x01(%rsi), %xmm2 > - movaps 0x11(%rsi), %xmm3 > - movaps 0x21(%rsi), %xmm4 > - movaps 0x31(%rsi), %xmm5 > - movaps 0x41(%rsi), %xmm6 > - movaps 0x51(%rsi), %xmm7 > - movaps 0x61(%rsi), %xmm8 > - movaps 0x71(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $15, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $15, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $15, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $15, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $15, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $15, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $15, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_15) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15_bwd): > - movaps -0x0f(%rsi), %xmm1 > - > - movaps -0x1f(%rsi), %xmm2 > - palignr $15, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2f(%rsi), %xmm3 > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3f(%rsi), %xmm4 > - palignr $15, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4f(%rsi), %xmm5 > - palignr $15, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5f(%rsi), %xmm6 > - palignr $15, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6f(%rsi), %xmm7 > - palignr $15, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7f(%rsi), %xmm8 > - palignr $15, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8f(%rsi), %xmm9 > - palignr $15, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - 
lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_15_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_fwd): > - movdqu (%rsi), %xmm1 > - movdqu %xmm0, (%r8) > - movdqa %xmm1, (%rdi) > - sub $16, %rdx > - add $16, %rsi > - add $16, %rdi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef USE_AS_MEMMOVE > - mov %rsi, %r9 > - sub %rdi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_fwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_fwd_start) > -L(memmove_is_memcpy_fwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger_in_fwd) > - mov %rdx, %rcx > -L(bigger_in_fwd): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy_fwd) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy_fwd) > - add %rcx, %rdx > - xor %rcx, %rcx > -L(2steps_copy_fwd): > - sub $0x80, %rdx > -L(gobble_mem_fwd_loop): > - sub $0x80, %rdx > - prefetcht0 0x200(%rsi) > - prefetcht0 0x300(%rsi) > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lfence > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - movntdq %xmm4, 0x40(%rdi) > - movntdq %xmm5, 0x50(%rdi) > - movntdq %xmm6, 0x60(%rdi) > - movntdq %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_mem_fwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_fwd_end) > - add $0x80, %rdx > -L(ll_cache_copy_fwd): > - add %rcx, %rdx > -L(ll_cache_copy_fwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop_fwd): > - prefetchnta 0x1c0(%rsi) > - prefetchnta 0x280(%rsi) > - prefetchnta 0x1c0(%rdi) > - prefetchnta 0x280(%rdi) > - sub $0x80, %rdx > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - movdqa %xmm2, 0x20(%rdi) > - movdqa %xmm3, 0x30(%rdi) > - movdqa %xmm4, 0x40(%rdi) > - movdqa %xmm5, 0x50(%rdi) > - movdqa %xmm6, 0x60(%rdi) > - movdqa %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_ll_loop_fwd) > -L(gobble_mem_fwd_end): > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_bwd): > - add %rdx, %rsi > - add %rdx, %rdi > - > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $-16, %rdi > - sub %rdi, %r9 > - sub %r9, %rsi > - sub %r9, %rdx > - > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef USE_AS_MEMMOVE > - mov %rdi, %r9 > - sub %rsi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_bwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_bwd_start) > -L(memmove_is_memcpy_bwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger) > - mov %rdx, %rcx > -L(bigger): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy) > - add %rcx, %rdx > - xor %rcx, 
%rcx > -L(2steps_copy): > - sub $0x80, %rdx > -L(gobble_mem_bwd_loop): > - sub $0x80, %rdx > - prefetcht0 -0x200(%rsi) > - prefetcht0 -0x300(%rsi) > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - lfence > - movntdq %xmm1, -0x10(%rdi) > - movntdq %xmm2, -0x20(%rdi) > - movntdq %xmm3, -0x30(%rdi) > - movntdq %xmm4, -0x40(%rdi) > - movntdq %xmm5, -0x50(%rdi) > - movntdq %xmm6, -0x60(%rdi) > - movntdq %xmm7, -0x70(%rdi) > - movntdq %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_mem_bwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_bwd_end) > - add $0x80, %rdx > -L(ll_cache_copy): > - add %rcx, %rdx > -L(ll_cache_copy_bwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop): > - prefetchnta -0x1c0(%rsi) > - prefetchnta -0x280(%rsi) > - prefetchnta -0x1c0(%rdi) > - prefetchnta -0x280(%rdi) > - sub $0x80, %rdx > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - movdqa %xmm1, -0x10(%rdi) > - movdqa %xmm2, -0x20(%rdi) > - movdqa %xmm3, -0x30(%rdi) > - movdqa %xmm4, -0x40(%rdi) > - movdqa %xmm5, -0x50(%rdi) > - movdqa %xmm6, -0x60(%rdi) > - movdqa %xmm7, -0x70(%rdi) > - movdqa %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_ll_loop) > -L(gobble_mem_bwd_end): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rsi > - sub %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(fwd_write_128bytes): > - lddqu -128(%rsi), %xmm0 > - movdqu %xmm0, -128(%rdi) > -L(fwd_write_112bytes): > - lddqu -112(%rsi), %xmm0 > - movdqu %xmm0, -112(%rdi) > -L(fwd_write_96bytes): > - lddqu -96(%rsi), %xmm0 > - movdqu %xmm0, -96(%rdi) > -L(fwd_write_80bytes): > - lddqu -80(%rsi), %xmm0 > - movdqu %xmm0, -80(%rdi) > -L(fwd_write_64bytes): > - lddqu -64(%rsi), %xmm0 > - movdqu %xmm0, -64(%rdi) > -L(fwd_write_48bytes): > - lddqu -48(%rsi), %xmm0 > - movdqu %xmm0, -48(%rdi) > -L(fwd_write_32bytes): > - lddqu -32(%rsi), %xmm0 > - movdqu %xmm0, -32(%rdi) > -L(fwd_write_16bytes): > - lddqu -16(%rsi), %xmm0 > - movdqu %xmm0, -16(%rdi) > -L(fwd_write_0bytes): > - ret > - > - > - .p2align 4 > -L(fwd_write_143bytes): > - lddqu -143(%rsi), %xmm0 > - movdqu %xmm0, -143(%rdi) > -L(fwd_write_127bytes): > - lddqu -127(%rsi), %xmm0 > - movdqu %xmm0, -127(%rdi) > -L(fwd_write_111bytes): > - lddqu -111(%rsi), %xmm0 > - movdqu %xmm0, -111(%rdi) > -L(fwd_write_95bytes): > - lddqu -95(%rsi), %xmm0 > - movdqu %xmm0, -95(%rdi) > -L(fwd_write_79bytes): > - lddqu -79(%rsi), %xmm0 > - movdqu %xmm0, -79(%rdi) > -L(fwd_write_63bytes): > - lddqu -63(%rsi), %xmm0 > - movdqu %xmm0, -63(%rdi) > -L(fwd_write_47bytes): > - lddqu -47(%rsi), %xmm0 > - movdqu %xmm0, -47(%rdi) > -L(fwd_write_31bytes): > - lddqu -31(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -31(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_15bytes): > - mov -15(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -15(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_142bytes): > - lddqu -142(%rsi), %xmm0 > - movdqu %xmm0, -142(%rdi) > -L(fwd_write_126bytes): > - lddqu -126(%rsi), %xmm0 > - movdqu %xmm0, -126(%rdi) > 
-L(fwd_write_110bytes): > - lddqu -110(%rsi), %xmm0 > - movdqu %xmm0, -110(%rdi) > -L(fwd_write_94bytes): > - lddqu -94(%rsi), %xmm0 > - movdqu %xmm0, -94(%rdi) > -L(fwd_write_78bytes): > - lddqu -78(%rsi), %xmm0 > - movdqu %xmm0, -78(%rdi) > -L(fwd_write_62bytes): > - lddqu -62(%rsi), %xmm0 > - movdqu %xmm0, -62(%rdi) > -L(fwd_write_46bytes): > - lddqu -46(%rsi), %xmm0 > - movdqu %xmm0, -46(%rdi) > -L(fwd_write_30bytes): > - lddqu -30(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -30(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_14bytes): > - mov -14(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -14(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_141bytes): > - lddqu -141(%rsi), %xmm0 > - movdqu %xmm0, -141(%rdi) > -L(fwd_write_125bytes): > - lddqu -125(%rsi), %xmm0 > - movdqu %xmm0, -125(%rdi) > -L(fwd_write_109bytes): > - lddqu -109(%rsi), %xmm0 > - movdqu %xmm0, -109(%rdi) > -L(fwd_write_93bytes): > - lddqu -93(%rsi), %xmm0 > - movdqu %xmm0, -93(%rdi) > -L(fwd_write_77bytes): > - lddqu -77(%rsi), %xmm0 > - movdqu %xmm0, -77(%rdi) > -L(fwd_write_61bytes): > - lddqu -61(%rsi), %xmm0 > - movdqu %xmm0, -61(%rdi) > -L(fwd_write_45bytes): > - lddqu -45(%rsi), %xmm0 > - movdqu %xmm0, -45(%rdi) > -L(fwd_write_29bytes): > - lddqu -29(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -29(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_13bytes): > - mov -13(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -13(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_140bytes): > - lddqu -140(%rsi), %xmm0 > - movdqu %xmm0, -140(%rdi) > -L(fwd_write_124bytes): > - lddqu -124(%rsi), %xmm0 > - movdqu %xmm0, -124(%rdi) > -L(fwd_write_108bytes): > - lddqu -108(%rsi), %xmm0 > - movdqu %xmm0, -108(%rdi) > -L(fwd_write_92bytes): > - lddqu -92(%rsi), %xmm0 > - movdqu %xmm0, -92(%rdi) > -L(fwd_write_76bytes): > - lddqu -76(%rsi), %xmm0 > - movdqu %xmm0, -76(%rdi) > -L(fwd_write_60bytes): > - lddqu -60(%rsi), %xmm0 > - movdqu %xmm0, -60(%rdi) > -L(fwd_write_44bytes): > - lddqu -44(%rsi), %xmm0 > - movdqu %xmm0, -44(%rdi) > -L(fwd_write_28bytes): > - lddqu -28(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -28(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_12bytes): > - mov -12(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -12(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_139bytes): > - lddqu -139(%rsi), %xmm0 > - movdqu %xmm0, -139(%rdi) > -L(fwd_write_123bytes): > - lddqu -123(%rsi), %xmm0 > - movdqu %xmm0, -123(%rdi) > -L(fwd_write_107bytes): > - lddqu -107(%rsi), %xmm0 > - movdqu %xmm0, -107(%rdi) > -L(fwd_write_91bytes): > - lddqu -91(%rsi), %xmm0 > - movdqu %xmm0, -91(%rdi) > -L(fwd_write_75bytes): > - lddqu -75(%rsi), %xmm0 > - movdqu %xmm0, -75(%rdi) > -L(fwd_write_59bytes): > - lddqu -59(%rsi), %xmm0 > - movdqu %xmm0, -59(%rdi) > -L(fwd_write_43bytes): > - lddqu -43(%rsi), %xmm0 > - movdqu %xmm0, -43(%rdi) > -L(fwd_write_27bytes): > - lddqu -27(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -27(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_11bytes): > - mov -11(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -11(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_138bytes): > - lddqu -138(%rsi), %xmm0 > - movdqu %xmm0, -138(%rdi) > -L(fwd_write_122bytes): > - lddqu -122(%rsi), %xmm0 > - movdqu %xmm0, -122(%rdi) > -L(fwd_write_106bytes): > - lddqu -106(%rsi), 
%xmm0 > - movdqu %xmm0, -106(%rdi) > -L(fwd_write_90bytes): > - lddqu -90(%rsi), %xmm0 > - movdqu %xmm0, -90(%rdi) > -L(fwd_write_74bytes): > - lddqu -74(%rsi), %xmm0 > - movdqu %xmm0, -74(%rdi) > -L(fwd_write_58bytes): > - lddqu -58(%rsi), %xmm0 > - movdqu %xmm0, -58(%rdi) > -L(fwd_write_42bytes): > - lddqu -42(%rsi), %xmm0 > - movdqu %xmm0, -42(%rdi) > -L(fwd_write_26bytes): > - lddqu -26(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -26(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_10bytes): > - mov -10(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -10(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_137bytes): > - lddqu -137(%rsi), %xmm0 > - movdqu %xmm0, -137(%rdi) > -L(fwd_write_121bytes): > - lddqu -121(%rsi), %xmm0 > - movdqu %xmm0, -121(%rdi) > -L(fwd_write_105bytes): > - lddqu -105(%rsi), %xmm0 > - movdqu %xmm0, -105(%rdi) > -L(fwd_write_89bytes): > - lddqu -89(%rsi), %xmm0 > - movdqu %xmm0, -89(%rdi) > -L(fwd_write_73bytes): > - lddqu -73(%rsi), %xmm0 > - movdqu %xmm0, -73(%rdi) > -L(fwd_write_57bytes): > - lddqu -57(%rsi), %xmm0 > - movdqu %xmm0, -57(%rdi) > -L(fwd_write_41bytes): > - lddqu -41(%rsi), %xmm0 > - movdqu %xmm0, -41(%rdi) > -L(fwd_write_25bytes): > - lddqu -25(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -25(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_9bytes): > - mov -9(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -9(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_136bytes): > - lddqu -136(%rsi), %xmm0 > - movdqu %xmm0, -136(%rdi) > -L(fwd_write_120bytes): > - lddqu -120(%rsi), %xmm0 > - movdqu %xmm0, -120(%rdi) > -L(fwd_write_104bytes): > - lddqu -104(%rsi), %xmm0 > - movdqu %xmm0, -104(%rdi) > -L(fwd_write_88bytes): > - lddqu -88(%rsi), %xmm0 > - movdqu %xmm0, -88(%rdi) > -L(fwd_write_72bytes): > - lddqu -72(%rsi), %xmm0 > - movdqu %xmm0, -72(%rdi) > -L(fwd_write_56bytes): > - lddqu -56(%rsi), %xmm0 > - movdqu %xmm0, -56(%rdi) > -L(fwd_write_40bytes): > - lddqu -40(%rsi), %xmm0 > - movdqu %xmm0, -40(%rdi) > -L(fwd_write_24bytes): > - lddqu -24(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -24(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_8bytes): > - mov -8(%rsi), %rdx > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_135bytes): > - lddqu -135(%rsi), %xmm0 > - movdqu %xmm0, -135(%rdi) > -L(fwd_write_119bytes): > - lddqu -119(%rsi), %xmm0 > - movdqu %xmm0, -119(%rdi) > -L(fwd_write_103bytes): > - lddqu -103(%rsi), %xmm0 > - movdqu %xmm0, -103(%rdi) > -L(fwd_write_87bytes): > - lddqu -87(%rsi), %xmm0 > - movdqu %xmm0, -87(%rdi) > -L(fwd_write_71bytes): > - lddqu -71(%rsi), %xmm0 > - movdqu %xmm0, -71(%rdi) > -L(fwd_write_55bytes): > - lddqu -55(%rsi), %xmm0 > - movdqu %xmm0, -55(%rdi) > -L(fwd_write_39bytes): > - lddqu -39(%rsi), %xmm0 > - movdqu %xmm0, -39(%rdi) > -L(fwd_write_23bytes): > - lddqu -23(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -23(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_7bytes): > - mov -7(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -7(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_134bytes): > - lddqu -134(%rsi), %xmm0 > - movdqu %xmm0, -134(%rdi) > -L(fwd_write_118bytes): > - lddqu -118(%rsi), %xmm0 > - movdqu %xmm0, -118(%rdi) > -L(fwd_write_102bytes): > - lddqu -102(%rsi), %xmm0 > - movdqu %xmm0, -102(%rdi) > -L(fwd_write_86bytes): > - lddqu -86(%rsi), %xmm0 > - movdqu 
%xmm0, -86(%rdi) > -L(fwd_write_70bytes): > - lddqu -70(%rsi), %xmm0 > - movdqu %xmm0, -70(%rdi) > -L(fwd_write_54bytes): > - lddqu -54(%rsi), %xmm0 > - movdqu %xmm0, -54(%rdi) > -L(fwd_write_38bytes): > - lddqu -38(%rsi), %xmm0 > - movdqu %xmm0, -38(%rdi) > -L(fwd_write_22bytes): > - lddqu -22(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -22(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_6bytes): > - mov -6(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -6(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_133bytes): > - lddqu -133(%rsi), %xmm0 > - movdqu %xmm0, -133(%rdi) > -L(fwd_write_117bytes): > - lddqu -117(%rsi), %xmm0 > - movdqu %xmm0, -117(%rdi) > -L(fwd_write_101bytes): > - lddqu -101(%rsi), %xmm0 > - movdqu %xmm0, -101(%rdi) > -L(fwd_write_85bytes): > - lddqu -85(%rsi), %xmm0 > - movdqu %xmm0, -85(%rdi) > -L(fwd_write_69bytes): > - lddqu -69(%rsi), %xmm0 > - movdqu %xmm0, -69(%rdi) > -L(fwd_write_53bytes): > - lddqu -53(%rsi), %xmm0 > - movdqu %xmm0, -53(%rdi) > -L(fwd_write_37bytes): > - lddqu -37(%rsi), %xmm0 > - movdqu %xmm0, -37(%rdi) > -L(fwd_write_21bytes): > - lddqu -21(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -21(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_5bytes): > - mov -5(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -5(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_132bytes): > - lddqu -132(%rsi), %xmm0 > - movdqu %xmm0, -132(%rdi) > -L(fwd_write_116bytes): > - lddqu -116(%rsi), %xmm0 > - movdqu %xmm0, -116(%rdi) > -L(fwd_write_100bytes): > - lddqu -100(%rsi), %xmm0 > - movdqu %xmm0, -100(%rdi) > -L(fwd_write_84bytes): > - lddqu -84(%rsi), %xmm0 > - movdqu %xmm0, -84(%rdi) > -L(fwd_write_68bytes): > - lddqu -68(%rsi), %xmm0 > - movdqu %xmm0, -68(%rdi) > -L(fwd_write_52bytes): > - lddqu -52(%rsi), %xmm0 > - movdqu %xmm0, -52(%rdi) > -L(fwd_write_36bytes): > - lddqu -36(%rsi), %xmm0 > - movdqu %xmm0, -36(%rdi) > -L(fwd_write_20bytes): > - lddqu -20(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -20(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_4bytes): > - mov -4(%rsi), %edx > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_131bytes): > - lddqu -131(%rsi), %xmm0 > - movdqu %xmm0, -131(%rdi) > -L(fwd_write_115bytes): > - lddqu -115(%rsi), %xmm0 > - movdqu %xmm0, -115(%rdi) > -L(fwd_write_99bytes): > - lddqu -99(%rsi), %xmm0 > - movdqu %xmm0, -99(%rdi) > -L(fwd_write_83bytes): > - lddqu -83(%rsi), %xmm0 > - movdqu %xmm0, -83(%rdi) > -L(fwd_write_67bytes): > - lddqu -67(%rsi), %xmm0 > - movdqu %xmm0, -67(%rdi) > -L(fwd_write_51bytes): > - lddqu -51(%rsi), %xmm0 > - movdqu %xmm0, -51(%rdi) > -L(fwd_write_35bytes): > - lddqu -35(%rsi), %xmm0 > - movdqu %xmm0, -35(%rdi) > -L(fwd_write_19bytes): > - lddqu -19(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -19(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_3bytes): > - mov -3(%rsi), %dx > - mov -2(%rsi), %cx > - mov %dx, -3(%rdi) > - mov %cx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_130bytes): > - lddqu -130(%rsi), %xmm0 > - movdqu %xmm0, -130(%rdi) > -L(fwd_write_114bytes): > - lddqu -114(%rsi), %xmm0 > - movdqu %xmm0, -114(%rdi) > -L(fwd_write_98bytes): > - lddqu -98(%rsi), %xmm0 > - movdqu %xmm0, -98(%rdi) > -L(fwd_write_82bytes): > - lddqu -82(%rsi), %xmm0 > - movdqu %xmm0, -82(%rdi) > -L(fwd_write_66bytes): > - lddqu -66(%rsi), %xmm0 > - movdqu %xmm0, -66(%rdi) > 
-L(fwd_write_50bytes): > - lddqu -50(%rsi), %xmm0 > - movdqu %xmm0, -50(%rdi) > -L(fwd_write_34bytes): > - lddqu -34(%rsi), %xmm0 > - movdqu %xmm0, -34(%rdi) > -L(fwd_write_18bytes): > - lddqu -18(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -18(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_2bytes): > - movzwl -2(%rsi), %edx > - mov %dx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_129bytes): > - lddqu -129(%rsi), %xmm0 > - movdqu %xmm0, -129(%rdi) > -L(fwd_write_113bytes): > - lddqu -113(%rsi), %xmm0 > - movdqu %xmm0, -113(%rdi) > -L(fwd_write_97bytes): > - lddqu -97(%rsi), %xmm0 > - movdqu %xmm0, -97(%rdi) > -L(fwd_write_81bytes): > - lddqu -81(%rsi), %xmm0 > - movdqu %xmm0, -81(%rdi) > -L(fwd_write_65bytes): > - lddqu -65(%rsi), %xmm0 > - movdqu %xmm0, -65(%rdi) > -L(fwd_write_49bytes): > - lddqu -49(%rsi), %xmm0 > - movdqu %xmm0, -49(%rdi) > -L(fwd_write_33bytes): > - lddqu -33(%rsi), %xmm0 > - movdqu %xmm0, -33(%rdi) > -L(fwd_write_17bytes): > - lddqu -17(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -17(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_1bytes): > - movzbl -1(%rsi), %edx > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_128bytes): > - lddqu 112(%rsi), %xmm0 > - movdqu %xmm0, 112(%rdi) > -L(bwd_write_112bytes): > - lddqu 96(%rsi), %xmm0 > - movdqu %xmm0, 96(%rdi) > -L(bwd_write_96bytes): > - lddqu 80(%rsi), %xmm0 > - movdqu %xmm0, 80(%rdi) > -L(bwd_write_80bytes): > - lddqu 64(%rsi), %xmm0 > - movdqu %xmm0, 64(%rdi) > -L(bwd_write_64bytes): > - lddqu 48(%rsi), %xmm0 > - movdqu %xmm0, 48(%rdi) > -L(bwd_write_48bytes): > - lddqu 32(%rsi), %xmm0 > - movdqu %xmm0, 32(%rdi) > -L(bwd_write_32bytes): > - lddqu 16(%rsi), %xmm0 > - movdqu %xmm0, 16(%rdi) > -L(bwd_write_16bytes): > - lddqu (%rsi), %xmm0 > - movdqu %xmm0, (%rdi) > -L(bwd_write_0bytes): > - ret > - > - .p2align 4 > -L(bwd_write_143bytes): > - lddqu 127(%rsi), %xmm0 > - movdqu %xmm0, 127(%rdi) > -L(bwd_write_127bytes): > - lddqu 111(%rsi), %xmm0 > - movdqu %xmm0, 111(%rdi) > -L(bwd_write_111bytes): > - lddqu 95(%rsi), %xmm0 > - movdqu %xmm0, 95(%rdi) > -L(bwd_write_95bytes): > - lddqu 79(%rsi), %xmm0 > - movdqu %xmm0, 79(%rdi) > -L(bwd_write_79bytes): > - lddqu 63(%rsi), %xmm0 > - movdqu %xmm0, 63(%rdi) > -L(bwd_write_63bytes): > - lddqu 47(%rsi), %xmm0 > - movdqu %xmm0, 47(%rdi) > -L(bwd_write_47bytes): > - lddqu 31(%rsi), %xmm0 > - movdqu %xmm0, 31(%rdi) > -L(bwd_write_31bytes): > - lddqu 15(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 15(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - > - .p2align 4 > -L(bwd_write_15bytes): > - mov 7(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 7(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_142bytes): > - lddqu 126(%rsi), %xmm0 > - movdqu %xmm0, 126(%rdi) > -L(bwd_write_126bytes): > - lddqu 110(%rsi), %xmm0 > - movdqu %xmm0, 110(%rdi) > -L(bwd_write_110bytes): > - lddqu 94(%rsi), %xmm0 > - movdqu %xmm0, 94(%rdi) > -L(bwd_write_94bytes): > - lddqu 78(%rsi), %xmm0 > - movdqu %xmm0, 78(%rdi) > -L(bwd_write_78bytes): > - lddqu 62(%rsi), %xmm0 > - movdqu %xmm0, 62(%rdi) > -L(bwd_write_62bytes): > - lddqu 46(%rsi), %xmm0 > - movdqu %xmm0, 46(%rdi) > -L(bwd_write_46bytes): > - lddqu 30(%rsi), %xmm0 > - movdqu %xmm0, 30(%rdi) > -L(bwd_write_30bytes): > - lddqu 14(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 14(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_14bytes): > - mov 6(%rsi), %rdx > - mov (%rsi), %rcx > - mov 
%rdx, 6(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_141bytes): > - lddqu 125(%rsi), %xmm0 > - movdqu %xmm0, 125(%rdi) > -L(bwd_write_125bytes): > - lddqu 109(%rsi), %xmm0 > - movdqu %xmm0, 109(%rdi) > -L(bwd_write_109bytes): > - lddqu 93(%rsi), %xmm0 > - movdqu %xmm0, 93(%rdi) > -L(bwd_write_93bytes): > - lddqu 77(%rsi), %xmm0 > - movdqu %xmm0, 77(%rdi) > -L(bwd_write_77bytes): > - lddqu 61(%rsi), %xmm0 > - movdqu %xmm0, 61(%rdi) > -L(bwd_write_61bytes): > - lddqu 45(%rsi), %xmm0 > - movdqu %xmm0, 45(%rdi) > -L(bwd_write_45bytes): > - lddqu 29(%rsi), %xmm0 > - movdqu %xmm0, 29(%rdi) > -L(bwd_write_29bytes): > - lddqu 13(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 13(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_13bytes): > - mov 5(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 5(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_140bytes): > - lddqu 124(%rsi), %xmm0 > - movdqu %xmm0, 124(%rdi) > -L(bwd_write_124bytes): > - lddqu 108(%rsi), %xmm0 > - movdqu %xmm0, 108(%rdi) > -L(bwd_write_108bytes): > - lddqu 92(%rsi), %xmm0 > - movdqu %xmm0, 92(%rdi) > -L(bwd_write_92bytes): > - lddqu 76(%rsi), %xmm0 > - movdqu %xmm0, 76(%rdi) > -L(bwd_write_76bytes): > - lddqu 60(%rsi), %xmm0 > - movdqu %xmm0, 60(%rdi) > -L(bwd_write_60bytes): > - lddqu 44(%rsi), %xmm0 > - movdqu %xmm0, 44(%rdi) > -L(bwd_write_44bytes): > - lddqu 28(%rsi), %xmm0 > - movdqu %xmm0, 28(%rdi) > -L(bwd_write_28bytes): > - lddqu 12(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 12(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_12bytes): > - mov 4(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 4(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_139bytes): > - lddqu 123(%rsi), %xmm0 > - movdqu %xmm0, 123(%rdi) > -L(bwd_write_123bytes): > - lddqu 107(%rsi), %xmm0 > - movdqu %xmm0, 107(%rdi) > -L(bwd_write_107bytes): > - lddqu 91(%rsi), %xmm0 > - movdqu %xmm0, 91(%rdi) > -L(bwd_write_91bytes): > - lddqu 75(%rsi), %xmm0 > - movdqu %xmm0, 75(%rdi) > -L(bwd_write_75bytes): > - lddqu 59(%rsi), %xmm0 > - movdqu %xmm0, 59(%rdi) > -L(bwd_write_59bytes): > - lddqu 43(%rsi), %xmm0 > - movdqu %xmm0, 43(%rdi) > -L(bwd_write_43bytes): > - lddqu 27(%rsi), %xmm0 > - movdqu %xmm0, 27(%rdi) > -L(bwd_write_27bytes): > - lddqu 11(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 11(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_11bytes): > - mov 3(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 3(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_138bytes): > - lddqu 122(%rsi), %xmm0 > - movdqu %xmm0, 122(%rdi) > -L(bwd_write_122bytes): > - lddqu 106(%rsi), %xmm0 > - movdqu %xmm0, 106(%rdi) > -L(bwd_write_106bytes): > - lddqu 90(%rsi), %xmm0 > - movdqu %xmm0, 90(%rdi) > -L(bwd_write_90bytes): > - lddqu 74(%rsi), %xmm0 > - movdqu %xmm0, 74(%rdi) > -L(bwd_write_74bytes): > - lddqu 58(%rsi), %xmm0 > - movdqu %xmm0, 58(%rdi) > -L(bwd_write_58bytes): > - lddqu 42(%rsi), %xmm0 > - movdqu %xmm0, 42(%rdi) > -L(bwd_write_42bytes): > - lddqu 26(%rsi), %xmm0 > - movdqu %xmm0, 26(%rdi) > -L(bwd_write_26bytes): > - lddqu 10(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 10(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_10bytes): > - mov 2(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 2(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_137bytes): > - lddqu 121(%rsi), %xmm0 > - movdqu %xmm0, 121(%rdi) > -L(bwd_write_121bytes): > - 
lddqu 105(%rsi), %xmm0 > - movdqu %xmm0, 105(%rdi) > -L(bwd_write_105bytes): > - lddqu 89(%rsi), %xmm0 > - movdqu %xmm0, 89(%rdi) > -L(bwd_write_89bytes): > - lddqu 73(%rsi), %xmm0 > - movdqu %xmm0, 73(%rdi) > -L(bwd_write_73bytes): > - lddqu 57(%rsi), %xmm0 > - movdqu %xmm0, 57(%rdi) > -L(bwd_write_57bytes): > - lddqu 41(%rsi), %xmm0 > - movdqu %xmm0, 41(%rdi) > -L(bwd_write_41bytes): > - lddqu 25(%rsi), %xmm0 > - movdqu %xmm0, 25(%rdi) > -L(bwd_write_25bytes): > - lddqu 9(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 9(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_9bytes): > - mov 1(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 1(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_136bytes): > - lddqu 120(%rsi), %xmm0 > - movdqu %xmm0, 120(%rdi) > -L(bwd_write_120bytes): > - lddqu 104(%rsi), %xmm0 > - movdqu %xmm0, 104(%rdi) > -L(bwd_write_104bytes): > - lddqu 88(%rsi), %xmm0 > - movdqu %xmm0, 88(%rdi) > -L(bwd_write_88bytes): > - lddqu 72(%rsi), %xmm0 > - movdqu %xmm0, 72(%rdi) > -L(bwd_write_72bytes): > - lddqu 56(%rsi), %xmm0 > - movdqu %xmm0, 56(%rdi) > -L(bwd_write_56bytes): > - lddqu 40(%rsi), %xmm0 > - movdqu %xmm0, 40(%rdi) > -L(bwd_write_40bytes): > - lddqu 24(%rsi), %xmm0 > - movdqu %xmm0, 24(%rdi) > -L(bwd_write_24bytes): > - lddqu 8(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 8(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_8bytes): > - mov (%rsi), %rdx > - mov %rdx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_135bytes): > - lddqu 119(%rsi), %xmm0 > - movdqu %xmm0, 119(%rdi) > -L(bwd_write_119bytes): > - lddqu 103(%rsi), %xmm0 > - movdqu %xmm0, 103(%rdi) > -L(bwd_write_103bytes): > - lddqu 87(%rsi), %xmm0 > - movdqu %xmm0, 87(%rdi) > -L(bwd_write_87bytes): > - lddqu 71(%rsi), %xmm0 > - movdqu %xmm0, 71(%rdi) > -L(bwd_write_71bytes): > - lddqu 55(%rsi), %xmm0 > - movdqu %xmm0, 55(%rdi) > -L(bwd_write_55bytes): > - lddqu 39(%rsi), %xmm0 > - movdqu %xmm0, 39(%rdi) > -L(bwd_write_39bytes): > - lddqu 23(%rsi), %xmm0 > - movdqu %xmm0, 23(%rdi) > -L(bwd_write_23bytes): > - lddqu 7(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 7(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_7bytes): > - mov 3(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 3(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_134bytes): > - lddqu 118(%rsi), %xmm0 > - movdqu %xmm0, 118(%rdi) > -L(bwd_write_118bytes): > - lddqu 102(%rsi), %xmm0 > - movdqu %xmm0, 102(%rdi) > -L(bwd_write_102bytes): > - lddqu 86(%rsi), %xmm0 > - movdqu %xmm0, 86(%rdi) > -L(bwd_write_86bytes): > - lddqu 70(%rsi), %xmm0 > - movdqu %xmm0, 70(%rdi) > -L(bwd_write_70bytes): > - lddqu 54(%rsi), %xmm0 > - movdqu %xmm0, 54(%rdi) > -L(bwd_write_54bytes): > - lddqu 38(%rsi), %xmm0 > - movdqu %xmm0, 38(%rdi) > -L(bwd_write_38bytes): > - lddqu 22(%rsi), %xmm0 > - movdqu %xmm0, 22(%rdi) > -L(bwd_write_22bytes): > - lddqu 6(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 6(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_6bytes): > - mov 2(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 2(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_133bytes): > - lddqu 117(%rsi), %xmm0 > - movdqu %xmm0, 117(%rdi) > -L(bwd_write_117bytes): > - lddqu 101(%rsi), %xmm0 > - movdqu %xmm0, 101(%rdi) > -L(bwd_write_101bytes): > - lddqu 85(%rsi), %xmm0 > - movdqu %xmm0, 85(%rdi) > -L(bwd_write_85bytes): > - lddqu 69(%rsi), %xmm0 > - movdqu %xmm0, 69(%rdi) > 
-L(bwd_write_69bytes): > - lddqu 53(%rsi), %xmm0 > - movdqu %xmm0, 53(%rdi) > -L(bwd_write_53bytes): > - lddqu 37(%rsi), %xmm0 > - movdqu %xmm0, 37(%rdi) > -L(bwd_write_37bytes): > - lddqu 21(%rsi), %xmm0 > - movdqu %xmm0, 21(%rdi) > -L(bwd_write_21bytes): > - lddqu 5(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 5(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_5bytes): > - mov 1(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 1(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_132bytes): > - lddqu 116(%rsi), %xmm0 > - movdqu %xmm0, 116(%rdi) > -L(bwd_write_116bytes): > - lddqu 100(%rsi), %xmm0 > - movdqu %xmm0, 100(%rdi) > -L(bwd_write_100bytes): > - lddqu 84(%rsi), %xmm0 > - movdqu %xmm0, 84(%rdi) > -L(bwd_write_84bytes): > - lddqu 68(%rsi), %xmm0 > - movdqu %xmm0, 68(%rdi) > -L(bwd_write_68bytes): > - lddqu 52(%rsi), %xmm0 > - movdqu %xmm0, 52(%rdi) > -L(bwd_write_52bytes): > - lddqu 36(%rsi), %xmm0 > - movdqu %xmm0, 36(%rdi) > -L(bwd_write_36bytes): > - lddqu 20(%rsi), %xmm0 > - movdqu %xmm0, 20(%rdi) > -L(bwd_write_20bytes): > - lddqu 4(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 4(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_4bytes): > - mov (%rsi), %edx > - mov %edx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_131bytes): > - lddqu 115(%rsi), %xmm0 > - movdqu %xmm0, 115(%rdi) > -L(bwd_write_115bytes): > - lddqu 99(%rsi), %xmm0 > - movdqu %xmm0, 99(%rdi) > -L(bwd_write_99bytes): > - lddqu 83(%rsi), %xmm0 > - movdqu %xmm0, 83(%rdi) > -L(bwd_write_83bytes): > - lddqu 67(%rsi), %xmm0 > - movdqu %xmm0, 67(%rdi) > -L(bwd_write_67bytes): > - lddqu 51(%rsi), %xmm0 > - movdqu %xmm0, 51(%rdi) > -L(bwd_write_51bytes): > - lddqu 35(%rsi), %xmm0 > - movdqu %xmm0, 35(%rdi) > -L(bwd_write_35bytes): > - lddqu 19(%rsi), %xmm0 > - movdqu %xmm0, 19(%rdi) > -L(bwd_write_19bytes): > - lddqu 3(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 3(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_3bytes): > - mov 1(%rsi), %dx > - mov (%rsi), %cx > - mov %dx, 1(%rdi) > - mov %cx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_130bytes): > - lddqu 114(%rsi), %xmm0 > - movdqu %xmm0, 114(%rdi) > -L(bwd_write_114bytes): > - lddqu 98(%rsi), %xmm0 > - movdqu %xmm0, 98(%rdi) > -L(bwd_write_98bytes): > - lddqu 82(%rsi), %xmm0 > - movdqu %xmm0, 82(%rdi) > -L(bwd_write_82bytes): > - lddqu 66(%rsi), %xmm0 > - movdqu %xmm0, 66(%rdi) > -L(bwd_write_66bytes): > - lddqu 50(%rsi), %xmm0 > - movdqu %xmm0, 50(%rdi) > -L(bwd_write_50bytes): > - lddqu 34(%rsi), %xmm0 > - movdqu %xmm0, 34(%rdi) > -L(bwd_write_34bytes): > - lddqu 18(%rsi), %xmm0 > - movdqu %xmm0, 18(%rdi) > -L(bwd_write_18bytes): > - lddqu 2(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 2(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_2bytes): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_129bytes): > - lddqu 113(%rsi), %xmm0 > - movdqu %xmm0, 113(%rdi) > -L(bwd_write_113bytes): > - lddqu 97(%rsi), %xmm0 > - movdqu %xmm0, 97(%rdi) > -L(bwd_write_97bytes): > - lddqu 81(%rsi), %xmm0 > - movdqu %xmm0, 81(%rdi) > -L(bwd_write_81bytes): > - lddqu 65(%rsi), %xmm0 > - movdqu %xmm0, 65(%rdi) > -L(bwd_write_65bytes): > - lddqu 49(%rsi), %xmm0 > - movdqu %xmm0, 49(%rdi) > -L(bwd_write_49bytes): > - lddqu 33(%rsi), %xmm0 > - movdqu %xmm0, 33(%rdi) > -L(bwd_write_33bytes): > - lddqu 17(%rsi), %xmm0 > - movdqu %xmm0, 17(%rdi) > -L(bwd_write_17bytes): > - lddqu 1(%rsi), %xmm0 > - 
lddqu (%rsi), %xmm1 > - movdqu %xmm0, 1(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_1bytes): > - movzbl (%rsi), %edx > - mov %dl, (%rdi) > - ret > - > -END (MEMCPY) > - > - .section .rodata.ssse3,"a",@progbits > - .p2align 3 > -L(table_144_bytes_bwd): > - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) > - 
.int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) > - .int JMPTBL 
(L(bwd_write_108bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) > - > - .p2align 3 > -L(table_144_bytes_fwd): > - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_19bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_75bytes), 
L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_131bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) > - > - .p2align 3 > -L(shl_table_fwd): > - .int JMPTBL (L(shl_0), L(shl_table_fwd)) > - .int JMPTBL (L(shl_1), L(shl_table_fwd)) > - .int JMPTBL (L(shl_2), L(shl_table_fwd)) > - .int JMPTBL (L(shl_3), L(shl_table_fwd)) > - .int JMPTBL (L(shl_4), L(shl_table_fwd)) > - .int JMPTBL (L(shl_5), L(shl_table_fwd)) > - .int JMPTBL (L(shl_6), L(shl_table_fwd)) > - .int JMPTBL (L(shl_7), L(shl_table_fwd)) > - .int JMPTBL (L(shl_8), L(shl_table_fwd)) > - .int JMPTBL (L(shl_9), L(shl_table_fwd)) > - .int JMPTBL (L(shl_10), L(shl_table_fwd)) > - .int JMPTBL (L(shl_11), L(shl_table_fwd)) > - .int JMPTBL (L(shl_12), L(shl_table_fwd)) > - .int JMPTBL (L(shl_13), L(shl_table_fwd)) > - .int JMPTBL (L(shl_14), L(shl_table_fwd)) > - .int JMPTBL (L(shl_15), L(shl_table_fwd)) > - > - .p2align 3 > -L(shl_table_bwd): > - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) > - > -#endif > diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > deleted file mode 100644 > index f9a4e9aff9..0000000000 > --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_MEMMOVE > -#define MEMCPY __memmove_ssse3_back > -#define MEMCPY_CHK __memmove_chk_ssse3_back > -#include "memcpy-ssse3-back.S" > -- > 2.25.1 > ^ permalink raw reply [flat|nested] 49+ messages in thread
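[Editor's aside: the four-line memmove-ssse3-back.S deleted at the end of the patch above shows the wrapper pattern these multiarch files rely on: a new variant is created by defining a couple of macros and then re-including the shared implementation, so one body of code is assembled under several entry-point names. Below is a minimal C sketch of the idea; the function names and the byte-loop body are illustrative stand-ins, not glibc's code.]

#include <stdio.h>
#include <string.h>

/* Illustrative sketch, not glibc code: DEFINE_COPY stands in for
   memcpy-ssse3-back.S, whose entry-point name and overlap behavior are
   chosen by macros (MEMCPY, USE_AS_MEMMOVE, ...) defined before
   inclusion.  */
#define DEFINE_COPY(NAME, HANDLE_OVERLAP)			\
  void *							\
  NAME (void *dst, const void *src, size_t n)			\
  {								\
    unsigned char *d = dst;					\
    const unsigned char *s = src;				\
    if (HANDLE_OVERLAP && d > s && d < s + n)			\
      while (n--)		/* overlapping: copy backward */ \
	d[n] = s[n];						\
    else							\
      while (n--)		/* plain forward copy */	\
	*d++ = *s++;						\
    return dst;							\
  }

/* Plays the role of building memcpy-ssse3-back.S directly
   (no USE_AS_MEMMOVE).  */
DEFINE_COPY (copy_fwd, 0)
/* Plays the role of memmove-ssse3-back.S, which just defines
   USE_AS_MEMMOVE and re-includes the same source file.  */
DEFINE_COPY (copy_overlap_safe, 1)

int
main (void)
{
  char buf[] = "abcdefgh";
  copy_overlap_safe (buf + 2, buf, 6);	/* overlapping move */
  puts (buf);				/* prints "ababcdef" */
  char out[4];
  copy_fwd (out, "hey", 4);
  puts (out);				/* prints "hey" */
  return 0;
}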
* [PATCH v3 3/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (3 preceding siblings ...) 2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-04-10 0:42 ` Noah Goldstein 2022-04-10 0:48 ` Noah Goldstein 2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein ` (4 subsequent siblings) 9 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw) To: libc-alpha

The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and EVEX
versions are generally preferable. memcpy/memmove is one exception where
avoiding unaligned loads with `palignr` is important for some targets.

This commit replaces memmove-ssse3 with a better optimized and lower
code footprint version. As well, it aliases memcpy to memmove.

Aside from this function, all other SSSE3 functions should be safe to
remove.

Performance is not changed drastically: it shows overall improvement
without any major regressions or gains.

bench-memcpy geometric_mean(N=50)
New / Original: 0.962

bench-memcpy-random geometric_mean(N=50)
New / Original: 0.895

bench-memcpy-large geometric_mean(N=50)
New / Original: 0.894

Benchmarks were run on Zhaoxin KX-6840@2000MHz. See the attached
numbers for all results.

More importantly, this saves 7246 bytes of code size in memmove and an
additional 10741 bytes by reusing the memmove code for memcpy (17987
bytes saved in total), as well as an additional 896 bytes of rodata for
the jump table entries.
---
 sysdeps/x86_64/multiarch/Makefile        |    1 -
 sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 3151 ----------------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S |  386 ++-
 3 files changed, 382 insertions(+), 3156 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
   memcmpeq-sse2 \
-  memcpy-ssse3 \
   memmove-avx-unaligned-erms \
   memmove-avx-unaligned-erms-rtm \
   memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.
*/ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -# define MEMPCPY __mempcpy_ssse3 -# define MEMPCPY_CHK __mempcpy_chk_ssse3 -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(write_0bytes) - cmp $79, %rdx - jbe L(copy_forward) - jmp L(copy_backward) -L(copy_forward): -#endif -L(start): - cmp $79, %rdx - lea L(table_less_80bytes)(%rip), %r11 - ja L(80bytesormore) - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(80bytesormore): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_fwd) - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - - .p2align 4 -L(copy_backward): - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_bwd) - and $0xf, %r9 - jz L(shl_0_bwd) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - - .p2align 4 -L(shl_0): - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble) - cmp $64, %rdx - jb L(shl_0_less_64bytes) - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -L(shl_0_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF - cmp 
$DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_cache_less_64bytes) - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_cache_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_less_64bytes) - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_mem_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_less_32bytes) - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -L(shl_0_mem_less_32bytes): - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble_bwd) - cmp $64, %rdx - jb L(shl_0_less_64bytes_bwd) - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -L(shl_0_less_64bytes_bwd): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_bwd): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_bwd_loop) -L(shl_0_gobble_bwd_loop): - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, 
-0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_gobble_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_gobble_bwd_less_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_bwd_loop): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa %xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_mem_bwd_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_bwd_less_32bytes) - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -L(shl_0_mem_bwd_less_32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1): - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_fwd) - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 -L(L1_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_1_loop_L1): - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_1_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_bwd) - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 -L(L1_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_1_bwd_loop_L1): - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 
- movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_1_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2): - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_fwd) - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 -L(L2_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_2_loop_L1): - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_2_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_bwd) - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 -L(L2_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_2_bwd_loop_L1): - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_2_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3): - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_fwd) - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 -L(L3_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_3_loop_L1): - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_3_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - lea 
(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_bwd) - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 -L(L3_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_3_bwd_loop_L1): - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_3_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4): - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_fwd) - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 -L(L4_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_4_loop_L1): - sub $64, %rdx - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_4_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_bwd) - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 -L(L4_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_4_bwd_loop_L1): - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_4_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5): - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_fwd) - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 -L(L5_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_5_loop_L1): - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb 
L(shl_5_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_bwd) - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 -L(L5_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_5_bwd_loop_L1): - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_5_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6): - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_fwd) - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 -L(L6_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_6_loop_L1): - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 -L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 -L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), 
%xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 -L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 -L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 -L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 -L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 -L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - 
- .p2align 4 -L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 -L(L9_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_9_loop_L1): - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_9_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_bwd) - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 -L(L9_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_9_bwd_loop_L1): - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_9_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10): - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_fwd) - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 -L(L10_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_10_loop_L1): - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_10_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_bwd) - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 -L(L10_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_10_bwd_loop_L1): - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, 
%xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_10_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11): - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_fwd) - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 -L(L11_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_11_loop_L1): - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_11_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_bwd) - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 -L(L11_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_11_bwd_loop_L1): - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_11_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12): - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_fwd) - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 -L(L12_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_12_loop_L1): - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_12_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_bwd) - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), 
%r9 -L(L12_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_12_bwd_loop_L1): - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_12_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13): - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_fwd) - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 -L(L13_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_13_loop_L1): - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_13_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_bwd) - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 -L(L13_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_13_bwd_loop_L1): - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_13_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14): - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_fwd) - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 -L(L14_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_14_loop_L1): - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_14_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 
-L(shl_14_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_bwd) - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 -L(L14_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_14_bwd_loop_L1): - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_14_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15): - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_fwd) - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 -L(L15_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_15_loop_L1): - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_15_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_bwd) - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 -L(L15_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_15_bwd_loop_L1): - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_15_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(write_72bytes): - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_64bytes): - movdqu 
-64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_56bytes): - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_48bytes): - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_40bytes): - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_32bytes): - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_24bytes): - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_16bytes): - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -L(write_0bytes): - ret - - .p2align 4 -L(write_73bytes): - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_65bytes): - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_57bytes): - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_49bytes): - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_41bytes): - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_33bytes): - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 
- mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_25bytes): - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_17bytes): - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_9bytes): - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_1bytes): - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_74bytes): - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_66bytes): - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_58bytes): - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_50bytes): - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_42bytes): - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_34bytes): - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_26bytes): - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_18bytes): - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_10bytes): - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_2bytes): - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(write_75bytes): - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx 
- mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_67bytes): - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_59bytes): - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_51bytes): - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_43bytes): - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_35bytes): - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_27bytes): - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_19bytes): - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_11bytes): - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(write_76bytes): - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_68bytes): - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_60bytes): - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, 
-4(%rdi) - ret - - .p2align 4 -L(write_52bytes): - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_44bytes): - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_36bytes): - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_28bytes): - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_20bytes): - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_12bytes): - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_77bytes): - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_69bytes): - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_61bytes): - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_53bytes): - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_45bytes): - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_37bytes): - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_29bytes): - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov 
-13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_21bytes): - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_13bytes): - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_78bytes): - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_70bytes): - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_62bytes): - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - 
ret - - .p2align 4 -L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_23bytes): - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif -L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - 
BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#endif - .p2align 4 -L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif -L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) 
- movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) -#endif - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL 
(L(write_58bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_59bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_60bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_61bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_62bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_63bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_64bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_65bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_66bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_67bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_68bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_69bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_70bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_71bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_72bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_73bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_74bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_75bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_76bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_77bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_78bytes), L(table_less_80bytes))
-	.int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
-	.p2align 3
-L(shl_table):
-	.int JMPTBL (L(shl_0), L(shl_table))
-	.int JMPTBL (L(shl_1), L(shl_table))
-	.int JMPTBL (L(shl_2), L(shl_table))
-	.int JMPTBL (L(shl_3), L(shl_table))
-	.int JMPTBL (L(shl_4), L(shl_table))
-	.int JMPTBL (L(shl_5), L(shl_table))
-	.int JMPTBL (L(shl_6), L(shl_table))
-	.int JMPTBL (L(shl_7), L(shl_table))
-	.int JMPTBL (L(shl_8), L(shl_table))
-	.int JMPTBL (L(shl_9), L(shl_table))
-	.int JMPTBL (L(shl_10), L(shl_table))
-	.int JMPTBL (L(shl_11), L(shl_table))
-	.int JMPTBL (L(shl_12), L(shl_table))
-	.int JMPTBL (L(shl_13), L(shl_table))
-	.int JMPTBL (L(shl_14), L(shl_table))
-	.int JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 3
-L(shl_table_bwd):
-	.int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..84e4e0f6cb 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,382 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE __memmove_ssse3
+# define MEMMOVE_CHK __memmove_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
+#endif
+
+	.section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+	cmp %RDX_LP, %RCX_LP
+	jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+	mov %RDI_LP, %RAX_LP
+	add %RDX_LP, %RAX_LP
+	jmp L(start)
+END(MEMPCPY)
+
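For context, the MEMPCPY entry above does nothing but precompute the
return value (dst + n) in %rax before falling into the shared L(start)
body.  A minimal C sketch of that layering, assuming only standard
memcpy (the helper name my_mempcpy is hypothetical):

#include <string.h>

/* mempcpy differs from memcpy only in its return value, so one copy
   body can serve both; the assembly does the same by computing
   dst + n before jumping to the shared L(start).  */
static void *my_mempcpy (void *dst, const void *src, size_t n)
{
  memcpy (dst, src, n);        /* shared copy body */
  return (char *) dst + n;     /* one past the last byte written */
}
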
+ENTRY(MEMMOVE_CHK)
+	cmp %RDX_LP, %RCX_LP
+	jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+	movq %rdi, %rax
+L(start):
+	cmpq $16, %rdx
+	jb L(copy_0_15)
+
+	/* These loads are always useful.  */
+	movups 0(%rsi), %xmm0
+	movups -16(%rsi, %rdx), %xmm7
+	cmpq $32, %rdx
+	ja L(more_2x_vec)
+
+	movups %xmm0, 0(%rdi)
+	movups %xmm7, -16(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_4x_vec):
+	movups 16(%rsi), %xmm1
+	movups -32(%rsi, %rdx), %xmm2
+
+	movups %xmm0, 0(%rdi)
+	movups %xmm1, 16(%rdi)
+	movups %xmm2, -32(%rdi, %rdx)
+	movups %xmm7, -16(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_0_15):
+	cmpl $8, %edx
+	ja L(copy_9_15)
+
+	cmpl $4, %edx
+	jb L(copy_0_3)
+
+	movl 0(%rsi), %ecx
+	movl -4(%rsi, %rdx), %esi
+	movl %ecx, 0(%rdi)
+	movl %esi, -4(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 8
+L(copy_9_15):
+	movq 0(%rsi), %rcx
+	movq -8(%rsi, %rdx), %rsi
+	movq %rcx, 0(%rdi)
+	movq %rsi, -8(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_3):
+	cmpl $1, %edx
+	jl L(copy_0_0)
+	movzbl (%rsi), %ecx
+	je L(copy_0_1)
+
+	movzwl -2(%rsi, %rdx), %esi
+	movw %si, -2(%rdi, %rdx)
+L(copy_0_1):
+	movb %cl, (%rdi)
+L(copy_0_0):
+L(nop):
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	cmpq $64, %rdx
+	jbe L(copy_4x_vec)
+
+	/* We use rcx later to get the palignr value.  */
+	movq %rdi, %rcx
+
+	/* Copy backward when the ranges overlap with dst > src, for
+	   memmove safety.  */
+	subq %rsi, %rcx
+	cmpq %rdx, %rcx
+	jb L(copy_backward)
+
+	/* Load tail.  */
+
+	/* -16(%rsi, %rdx) already loaded into xmm7.  */
+	movups -32(%rsi, %rdx), %xmm8
+	movups -48(%rsi, %rdx), %xmm9
+
+	/* Get misalignment.  */
+	andl $0xf, %ecx
+
+	movq %rsi, %r9
+	addq %rcx, %rsi
+	andq $-16, %rsi
+	/* Get first vec for `palignr`.  */
+	movaps (%rsi), %xmm1
+
+	/* We have loaded (%rsi), so it is safe to do this store before
+	   the loop.  */
+	movups %xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp __x86_shared_cache_size_half(%rip), %rdx
+#endif
+	ja L(large_memcpy)
+
+	leaq -64(%rdi, %rdx), %r8
+	andq $-16, %rdi
+	movl $48, %edx
+
+	leaq L(loop_fwd_start)(%rip), %r9
+	sall $6, %ecx
+	addq %r9, %rcx
+	jmp * %rcx
+
+	.p2align 4,, 8
+L(copy_backward):
+	testq %rcx, %rcx
+	jz L(nop)
+
+	/* Preload tail.  */
+
+	/* (%rsi) already loaded into xmm0.  */
+	movups 16(%rsi), %xmm4
+	movups 32(%rsi), %xmm5
+
+	movq %rdi, %r8
+	subq %rdi, %rsi
+	leaq -49(%rdi, %rdx), %rdi
+	andq $-16, %rdi
+	addq %rdi, %rsi
+	andq $-16, %rsi
+
+	movaps 48(%rsi), %xmm6
+
+
+	leaq L(loop_bkwd_start)(%rip), %r9
+	andl $0xf, %ecx
+	sall $6, %ecx
+	addq %r9, %rcx
+	jmp * %rcx
+
+	.p2align 4,, 8
+L(large_memcpy):
+	movups -64(%r9, %rdx), %xmm10
+	movups -80(%r9, %rdx), %xmm11
+
+	sall $5, %ecx
+	leal (%rcx, %rcx, 2), %r8d
+	leaq -96(%rdi, %rdx), %rcx
+	andq $-16, %rdi
+	leaq L(large_loop_fwd_start)(%rip), %rdx
+	addq %r8, %rdx
+	jmp * %rdx
+
+
+	/* Instead of a typical jump table, all 16 loops are exactly
+	   64 bytes in size, so we can just jump to first loop
+	   + r8 * 64.  Before modifying any loop, ensure all their
+	   sizes match!  */
+	.p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+	movaps 16(%rsi), %xmm1
+	movaps 32(%rsi), %xmm2
+	movaps 48(%rsi), %xmm3
+	movaps %xmm1, 16(%rdi)
+	movaps %xmm2, 32(%rdi)
+	movaps %xmm3, 48(%rdi)
+	addq %rdx, %rdi
+	addq %rdx, %rsi
+	cmpq %rdi, %r8
+	ja L(loop_fwd_0x0)
+L(end_loop_fwd):
+	movups %xmm9, 16(%r8)
+	movups %xmm8, 32(%r8)
+	movups %xmm7, 48(%r8)
+	ret
+
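Earlier in this hunk, L(more_2x_vec) decides between the forward and
backward paths with a single unsigned compare (subq %rsi, %rcx;
cmpq %rdx, %rcx; jb L(copy_backward)).  A C sketch of the same test
(the helper name is hypothetical):

#include <stddef.h>
#include <stdint.h>

/* One unsigned subtraction covers both cases: if dst is below src the
   difference wraps to a huge value and the forward path is taken; if
   dst lands inside [src, src + n) the difference is less than n and
   the copy must run backward so no source byte is overwritten before
   it is read.  diff == 0 (dst == src) also takes the backward path,
   which returns immediately (L(nop) above).  */
static int must_copy_backward (const void *dst, const void *src, size_t n)
{
  uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
  return diff < n;
}
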
+	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_FWD(align_by);	\
+	.p2align 6;	\
+L(loop_fwd_ ## align_by):	\
+	movaps 16(%rsi), %xmm0;	\
+	movaps 32(%rsi), %xmm2;	\
+	movaps 48(%rsi), %xmm3;	\
+	movaps %xmm3, %xmm4;	\
+	palignr $align_by, %xmm2, %xmm3;	\
+	palignr $align_by, %xmm0, %xmm2;	\
+	palignr $align_by, %xmm1, %xmm0;	\
+	movaps %xmm4, %xmm1;	\
+	movaps %xmm0, 16(%rdi);	\
+	movaps %xmm2, 32(%rdi);	\
+	movaps %xmm3, 48(%rdi);	\
+	addq %rdx, %rdi;	\
+	addq %rdx, %rsi;	\
+	cmpq %rdi, %r8;	\
+	ja L(loop_fwd_ ## align_by);	\
+	jmp L(end_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_FWD (0xf)
+	ALIGNED_LOOP_FWD (0xe)
+	ALIGNED_LOOP_FWD (0xd)
+	ALIGNED_LOOP_FWD (0xc)
+	ALIGNED_LOOP_FWD (0xb)
+	ALIGNED_LOOP_FWD (0xa)
+	ALIGNED_LOOP_FWD (0x9)
+	ALIGNED_LOOP_FWD (0x8)
+	ALIGNED_LOOP_FWD (0x7)
+	ALIGNED_LOOP_FWD (0x6)
+	ALIGNED_LOOP_FWD (0x5)
+	ALIGNED_LOOP_FWD (0x4)
+	ALIGNED_LOOP_FWD (0x3)
+	ALIGNED_LOOP_FWD (0x2)
+	ALIGNED_LOOP_FWD (0x1)
+
+	.p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+	movaps 16(%rsi), %xmm1
+	movaps 32(%rsi), %xmm2
+	movaps 48(%rsi), %xmm3
+	movaps 64(%rsi), %xmm4
+	movaps 80(%rsi), %xmm5
+	movntps %xmm1, 16(%rdi)
+	movntps %xmm2, 32(%rdi)
+	movntps %xmm3, 48(%rdi)
+	movntps %xmm4, 64(%rdi)
+	movntps %xmm5, 80(%rdi)
+	addq $80, %rdi
+	addq $80, %rsi
+	cmpq %rdi, %rcx
+	ja L(large_loop_fwd_0x0)
+
+	/* Ensure no icache line split on tail.  */
+	.p2align 4
+L(end_large_loop_fwd):
+	sfence
+	movups %xmm11, 16(%rcx)
+	movups %xmm10, 32(%rcx)
+	movups %xmm9, 48(%rcx)
+	movups %xmm8, 64(%rcx)
+	movups %xmm7, 80(%rcx)
+	ret
+
+
+	/* Each body is > 64 bytes and <= 96 bytes.  The 32-byte
+	   alignment between them ensures 96-byte spacing between
+	   each.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by);	\
+	.p2align 5;	\
+L(large_loop_fwd_ ## align_by):	\
+	movaps 16(%rsi), %xmm0;	\
+	movaps 32(%rsi), %xmm2;	\
+	movaps 48(%rsi), %xmm3;	\
+	movaps 64(%rsi), %xmm4;	\
+	movaps 80(%rsi), %xmm5;	\
+	movaps %xmm5, %xmm6;	\
+	palignr $align_by, %xmm4, %xmm5;	\
+	palignr $align_by, %xmm3, %xmm4;	\
+	palignr $align_by, %xmm2, %xmm3;	\
+	palignr $align_by, %xmm0, %xmm2;	\
+	palignr $align_by, %xmm1, %xmm0;	\
+	movaps %xmm6, %xmm1;	\
+	movntps %xmm0, 16(%rdi);	\
+	movntps %xmm2, 32(%rdi);	\
+	movntps %xmm3, 48(%rdi);	\
+	movntps %xmm4, 64(%rdi);	\
+	movntps %xmm5, 80(%rdi);	\
+	addq $80, %rdi;	\
+	addq $80, %rsi;	\
+	cmpq %rdi, %rcx;	\
+	ja L(large_loop_fwd_ ## align_by);	\
+	jmp L(end_large_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LARGE_LOOP_FWD (0xf)
+	ALIGNED_LARGE_LOOP_FWD (0xe)
+	ALIGNED_LARGE_LOOP_FWD (0xd)
+	ALIGNED_LARGE_LOOP_FWD (0xc)
+	ALIGNED_LARGE_LOOP_FWD (0xb)
+	ALIGNED_LARGE_LOOP_FWD (0xa)
+	ALIGNED_LARGE_LOOP_FWD (0x9)
+	ALIGNED_LARGE_LOOP_FWD (0x8)
+	ALIGNED_LARGE_LOOP_FWD (0x7)
+	ALIGNED_LARGE_LOOP_FWD (0x6)
+	ALIGNED_LARGE_LOOP_FWD (0x5)
+	ALIGNED_LARGE_LOOP_FWD (0x4)
+	ALIGNED_LARGE_LOOP_FWD (0x3)
+	ALIGNED_LARGE_LOOP_FWD (0x2)
+	ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+	.p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+	movaps 32(%rsi), %xmm1
+	movaps 16(%rsi), %xmm2
+	movaps 0(%rsi), %xmm3
+	movaps %xmm1, 32(%rdi)
+	movaps %xmm2, 16(%rdi)
+	movaps %xmm3, 0(%rdi)
+	subq $48, %rdi
+	subq $48, %rsi
+	cmpq %rdi, %r8
+	jb L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+	movups %xmm7, -16(%r8, %rdx)
+	movups %xmm0, 0(%r8)
+	movups %xmm4, 16(%r8)
+	movups %xmm5, 32(%r8)
+
+	ret
+
+
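The ALIGNED_LOOP_FWD and ALIGNED_LARGE_LOOP_FWD macros above lean on
`palignr` so every load and store stays aligned even when source and
destination are mutually misaligned.  A byte-level C model of what the
AT&T form `palignr $k, %xmm_src, %xmm_dst` computes (the helper name
is hypothetical):

#include <stdint.h>
#include <string.h>

/* palignr treats src as the low 16 bytes and dst as the high 16 bytes
   of a 32-byte value, then extracts the 16-byte window starting at
   byte k.  The loops above use this to turn two aligned loads into
   one correctly shifted vector for an aligned store.  */
static void palignr_model (uint8_t out[16], const uint8_t dst_hi[16],
                           const uint8_t src_lo[16], unsigned k /* 0..15 */)
{
  uint8_t concat[32];
  memcpy (concat, src_lo, 16);       /* earlier source bytes */
  memcpy (concat + 16, dst_hi, 16);  /* later source bytes */
  memcpy (out, concat + k, 16);      /* shifted 16-byte window */
}
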
+	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by);	\
+	.p2align 6;	\
+L(loop_bkwd_ ## align_by):	\
+	movaps 32(%rsi), %xmm1;	\
+	movaps 16(%rsi), %xmm2;	\
+	movaps 0(%rsi), %xmm3;	\
+	palignr $align_by, %xmm1, %xmm6;	\
+	palignr $align_by, %xmm2, %xmm1;	\
+	palignr $align_by, %xmm3, %xmm2;	\
+	movaps %xmm6, 32(%rdi);	\
+	movaps %xmm1, 16(%rdi);	\
+	movaps %xmm2, 0(%rdi);	\
+	subq $48, %rdi;	\
+	subq $48, %rsi;	\
+	movaps %xmm3, %xmm6;	\
+	cmpq %rdi, %r8;	\
+	jb L(loop_bkwd_ ## align_by);	\
+	jmp L(end_loop_bkwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_BKWD (0xf)
+	ALIGNED_LOOP_BKWD (0xe)
+	ALIGNED_LOOP_BKWD (0xd)
+	ALIGNED_LOOP_BKWD (0xc)
+	ALIGNED_LOOP_BKWD (0xb)
+	ALIGNED_LOOP_BKWD (0xa)
+	ALIGNED_LOOP_BKWD (0x9)
+	ALIGNED_LOOP_BKWD (0x8)
+	ALIGNED_LOOP_BKWD (0x7)
+	ALIGNED_LOOP_BKWD (0x6)
+	ALIGNED_LOOP_BKWD (0x5)
+	ALIGNED_LOOP_BKWD (0x4)
+	ALIGNED_LOOP_BKWD (0x3)
+	ALIGNED_LOOP_BKWD (0x2)
+	ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
--
2.25.1
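The L(large_memcpy) path earlier in this file switches to `movntps`
non-temporal stores with a closing `sfence` once the copy exceeds half
the shared cache size, so a huge copy does not evict the working set.
A rough intrinsics sketch of that idea, assuming a 16-byte aligned
destination and a length that is a multiple of 16 (the helper name is
hypothetical; the real code also handles misalignment via palignr and
copies the tail with ordinary stores):

#include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_stream_si128 */
#include <stddef.h>

static void stream_copy16 (void *dst, const void *src, size_t n)
{
  __m128i *d = (__m128i *) dst;          /* must be 16-byte aligned */
  const __m128i *s = (const __m128i *) src;
  for (size_t i = 0; i < n / 16; i++)
    {
      __m128i v = _mm_loadu_si128 (s + i);  /* source may be unaligned */
      _mm_stream_si128 (d + i, v);          /* non-temporal store */
    }
  _mm_sfence ();  /* order the streaming stores before returning */
}
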
* Re: [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3
  2022-04-10  0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
@ 2022-04-10  0:48   ` Noah Goldstein
  0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
  To: GNU C Library

Disregard this patch. It's from the wrong patchset.

On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and
> EVEX versions are generally preferable. memcpy/memmove is one
> exception where avoiding unaligned loads with `palignr` is important
> for some targets.
>
> This commit replaces memmove-ssse3 with a better optimized and lower
> code footprint version. As well, it aliases memcpy to memmove.
>
> Aside from this function all other SSSE3 functions should be safe to
> remove.
>
> The performance is not changed drastically, although it shows overall
> improvement without any major regressions or gains.
>
> bench-memcpy geometric_mean(N=50) New / Original: 0.962
>
> bench-memcpy-random geometric_mean(N=50) New / Original: 0.895
>
> bench-memcpy-large geometric_mean(N=50) New / Original: 0.894
>
> Benchmarks were run on Zhaoxin KX-6840@2000MHz. See the attached
> numbers for all results.
>
> More importantly, this saves 7246 bytes of code size in memmove and an
> additional 10741 bytes by reusing the memmove code for memcpy (17987
> bytes saved in total). As well, an additional 896 bytes of rodata is
> saved for the jump table entries.
> ---
>  sysdeps/x86_64/multiarch/Makefile        |    1 -
>  sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 3151 ----------------------
>  sysdeps/x86_64/multiarch/memmove-ssse3.S |  386 ++-
>  3 files changed, 382 insertions(+), 3156 deletions(-)
>  delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 303fb5d734..e7ea963fc0 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,7 +16,6 @@ sysdep_routines += \
>         memcmpeq-avx2-rtm \
>         memcmpeq-evex \
>         memcmpeq-sse2 \
> -       memcpy-ssse3 \
>         memmove-avx-unaligned-erms \
>         memmove-avx-unaligned-erms-rtm \
>         memmove-avx512-no-vzeroupper \
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> deleted file mode 100644
> index 65644d3a09..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> +++ /dev/null
> @@ -1,3151 +0,0 @@
> -/* memcpy with SSSE3
> -   Copyright (C) 2010-2022 Free Software Foundation, Inc.
> -   This file is part of the GNU C Library.
> -
> -   The GNU C Library is free software; you can redistribute it and/or
> -   modify it under the terms of the GNU Lesser General Public
> -   License as published by the Free Software Foundation; either
> -   version 2.1 of the License, or (at your option) any later version.
> -
> -   The GNU C Library is distributed in the hope that it will be useful,
> -   but WITHOUT ANY WARRANTY; without even the implied warranty of
> -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
> -   Lesser General Public License for more details.
> -
> -   You should have received a copy of the GNU Lesser General Public
> -   License along with the GNU C Library; if not, see
> -   <https://www.gnu.org/licenses/>.
*/ > - > -#include <sysdep.h> > - > -#if IS_IN (libc) > - > -#include "asm-syntax.h" > - > -#ifndef MEMCPY > -# define MEMCPY __memcpy_ssse3 > -# define MEMCPY_CHK __memcpy_chk_ssse3 > -# define MEMPCPY __mempcpy_ssse3 > -# define MEMPCPY_CHK __mempcpy_chk_ssse3 > -#endif > - > -#define JMPTBL(I, B) I - B > - > -/* Branch to an entry in a jump table. TABLE is a jump table with > - relative offsets. INDEX is a register contains the index into the > - jump table. SCALE is the scale of INDEX. */ > -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), INDEX; \ > - lea (%r11, INDEX), INDEX; \ > - _CET_NOTRACK jmp *INDEX; \ > - ud2 > - > - .section .text.ssse3,"ax",@progbits > -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE > -ENTRY (MEMPCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMPCPY_CHK) > - > -ENTRY (MEMPCPY) > - mov %RDI_LP, %RAX_LP > - add %RDX_LP, %RAX_LP > - jmp L(start) > -END (MEMPCPY) > -#endif > - > -#if !defined USE_AS_BCOPY > -ENTRY (MEMCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMCPY_CHK) > -#endif > - > -ENTRY (MEMCPY) > - mov %RDI_LP, %RAX_LP > -#ifdef USE_AS_MEMPCPY > - add %RDX_LP, %RAX_LP > -#endif > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -#endif > - > -#ifdef USE_AS_MEMMOVE > - cmp %rsi, %rdi > - jb L(copy_forward) > - je L(write_0bytes) > - cmp $79, %rdx > - jbe L(copy_forward) > - jmp L(copy_backward) > -L(copy_forward): > -#endif > -L(start): > - cmp $79, %rdx > - lea L(table_less_80bytes)(%rip), %r11 > - ja L(80bytesormore) > - movslq (%r11, %rdx, 4), %r9 > - add %rdx, %rsi > - add %rdx, %rdi > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(80bytesormore): > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jle L(copy_backward) > -#endif > - > - movdqu (%rsi), %xmm0 > - mov %rdi, %rcx > - and $-16, %rdi > - add $16, %rdi > - mov %rcx, %r8 > - sub %rdi, %rcx > - add %rcx, %rdx > - sub %rcx, %rsi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > - cmp %rcx, %rdx > - mov %rsi, %r9 > - ja L(large_page_fwd) > - and $0xf, %r9 > - jz L(shl_0) > -#ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_data_cache_size_half(%rip), %RCX_LP > -#endif > - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) > - > - .p2align 4 > -L(copy_backward): > - movdqu -16(%rsi, %rdx), %xmm0 > - add %rdx, %rsi > - lea -16(%rdi, %rdx), %r8 > - add %rdx, %rdi > - > - mov %rdi, %rcx > - and $0xf, %rcx > - xor %rcx, %rdi > - sub %rcx, %rdx > - sub %rcx, %rsi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > - > - cmp %rcx, %rdx > - mov %rsi, %r9 > - ja L(large_page_bwd) > - and $0xf, %r9 > - jz L(shl_0_bwd) > -#ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_data_cache_size_half(%rip), %RCX_LP > -#endif > - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) > - > - .p2align 4 > -L(shl_0): > - sub $16, %rdx > - movdqa (%rsi), %xmm1 > - add $16, %rsi > - movdqa %xmm1, (%rdi) > - add $16, %rdi > - cmp $128, %rdx > - movdqu %xmm0, (%r8) > - ja L(shl_0_gobble) > - cmp $64, %rdx > - jb L(shl_0_less_64bytes) > - movaps (%rsi), %xmm4 > - movaps 16(%rsi), %xmm1 > - movaps 32(%rsi), %xmm2 > - movaps 48(%rsi), %xmm3 > - movaps %xmm4, 
(%rdi) > - movaps %xmm1, 16(%rdi) > - movaps %xmm2, 32(%rdi) > - movaps %xmm3, 48(%rdi) > - sub $64, %rdx > - add $64, %rsi > - add $64, %rdi > -L(shl_0_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble): > -#ifdef DATA_CACHE_SIZE_HALF > - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %RDX_LP > -#endif > - lea -128(%rdx), %rdx > - jae L(shl_0_gobble_mem_loop) > -L(shl_0_gobble_cache_loop): > - movdqa (%rsi), %xmm4 > - movaps 0x10(%rsi), %xmm1 > - movaps 0x20(%rsi), %xmm2 > - movaps 0x30(%rsi), %xmm3 > - > - movdqa %xmm4, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - > - sub $128, %rdx > - movaps 0x40(%rsi), %xmm4 > - movaps 0x50(%rsi), %xmm5 > - movaps 0x60(%rsi), %xmm6 > - movaps 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - movaps %xmm4, 0x40(%rdi) > - movaps %xmm5, 0x50(%rdi) > - movaps %xmm6, 0x60(%rdi) > - movaps %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_cache_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_cache_less_64bytes) > - > - movdqa (%rsi), %xmm4 > - sub $0x40, %rdx > - movdqa 0x10(%rsi), %xmm1 > - > - movdqa %xmm4, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - > - movdqa 0x20(%rsi), %xmm4 > - movdqa 0x30(%rsi), %xmm1 > - add $0x40, %rsi > - > - movdqa %xmm4, 0x20(%rdi) > - movdqa %xmm1, 0x30(%rdi) > - add $0x40, %rdi > -L(shl_0_cache_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_mem_loop): > - prefetcht0 0x1c0(%rsi) > - prefetcht0 0x280(%rsi) > - > - movdqa (%rsi), %xmm0 > - movdqa 0x10(%rsi), %xmm1 > - movdqa 0x20(%rsi), %xmm2 > - movdqa 0x30(%rsi), %xmm3 > - movdqa 0x40(%rsi), %xmm4 > - movdqa 0x50(%rsi), %xmm5 > - movdqa 0x60(%rsi), %xmm6 > - movdqa 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - sub $0x80, %rdx > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - movdqa %xmm2, 0x20(%rdi) > - movdqa %xmm3, 0x30(%rdi) > - movdqa %xmm4, 0x40(%rdi) > - movdqa %xmm5, 0x50(%rdi) > - movdqa %xmm6, 0x60(%rdi) > - movdqa %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_mem_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_mem_less_64bytes) > - > - movdqa (%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa 0x10(%rsi), %xmm1 > - > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - > - movdqa 0x20(%rsi), %xmm0 > - movdqa 0x30(%rsi), %xmm1 > - add $0x40, %rsi > - > - movdqa %xmm0, 0x20(%rdi) > - movdqa %xmm1, 0x30(%rdi) > - add $0x40, %rdi > -L(shl_0_mem_less_64bytes): > - cmp $0x20, %rdx > - jb L(shl_0_mem_less_32bytes) > - movdqa (%rsi), %xmm0 > - sub $0x20, %rdx > - movdqa 0x10(%rsi), %xmm1 > - add $0x20, %rsi > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - add $0x20, %rdi > -L(shl_0_mem_less_32bytes): > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_bwd): > - sub $16, %rdx > - movdqa -0x10(%rsi), %xmm1 > - sub $16, %rsi > - movdqa %xmm1, -0x10(%rdi) > - sub $16, %rdi > - cmp $0x80, %rdx > - movdqu %xmm0, (%r8) > - ja L(shl_0_gobble_bwd) > - cmp $64, %rdx > - jb L(shl_0_less_64bytes_bwd) > - movaps -0x10(%rsi), %xmm0 > - movaps -0x20(%rsi), %xmm1 > - movaps -0x30(%rsi), %xmm2 > - movaps -0x40(%rsi), %xmm3 > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) 
> - sub $64, %rdx > - sub $0x40, %rsi > - sub $0x40, %rdi > -L(shl_0_less_64bytes_bwd): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_bwd): > -#ifdef DATA_CACHE_SIZE_HALF > - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %RDX_LP > -#endif > - lea -128(%rdx), %rdx > - jae L(shl_0_gobble_mem_bwd_loop) > -L(shl_0_gobble_bwd_loop): > - movdqa -0x10(%rsi), %xmm0 > - movaps -0x20(%rsi), %xmm1 > - movaps -0x30(%rsi), %xmm2 > - movaps -0x40(%rsi), %xmm3 > - > - movdqa %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - > - sub $0x80, %rdx > - movaps -0x50(%rsi), %xmm4 > - movaps -0x60(%rsi), %xmm5 > - movaps -0x70(%rsi), %xmm6 > - movaps -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - movaps %xmm4, -0x50(%rdi) > - movaps %xmm5, -0x60(%rdi) > - movaps %xmm6, -0x70(%rdi) > - movaps %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_gobble_bwd_less_64bytes) > - > - movdqa -0x10(%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa -0x20(%rsi), %xmm1 > - > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - > - movdqa -0x30(%rsi), %xmm0 > - movdqa -0x40(%rsi), %xmm1 > - sub $0x40, %rsi > - > - movdqa %xmm0, -0x30(%rdi) > - movdqa %xmm1, -0x40(%rdi) > - sub $0x40, %rdi > -L(shl_0_gobble_bwd_less_64bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_mem_bwd_loop): > - prefetcht0 -0x1c0(%rsi) > - prefetcht0 -0x280(%rsi) > - movdqa -0x10(%rsi), %xmm0 > - movdqa -0x20(%rsi), %xmm1 > - movdqa -0x30(%rsi), %xmm2 > - movdqa -0x40(%rsi), %xmm3 > - movdqa -0x50(%rsi), %xmm4 > - movdqa -0x60(%rsi), %xmm5 > - movdqa -0x70(%rsi), %xmm6 > - movdqa -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - sub $0x80, %rdx > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - movdqa %xmm2, -0x30(%rdi) > - movdqa %xmm3, -0x40(%rdi) > - movdqa %xmm4, -0x50(%rdi) > - movdqa %xmm5, -0x60(%rdi) > - movdqa %xmm6, -0x70(%rdi) > - movdqa %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_mem_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_mem_bwd_less_64bytes) > - > - movdqa -0x10(%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa -0x20(%rsi), %xmm1 > - > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - > - movdqa -0x30(%rsi), %xmm0 > - movdqa -0x40(%rsi), %xmm1 > - sub $0x40, %rsi > - > - movdqa %xmm0, -0x30(%rdi) > - movdqa %xmm1, -0x40(%rdi) > - sub $0x40, %rdi > -L(shl_0_mem_bwd_less_64bytes): > - cmp $0x20, %rdx > - jb L(shl_0_mem_bwd_less_32bytes) > - movdqa -0x10(%rsi), %xmm0 > - sub $0x20, %rdx > - movdqa -0x20(%rsi), %xmm1 > - sub $0x20, %rsi > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - sub $0x20, %rdi > -L(shl_0_mem_bwd_less_32bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_1): > - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x01(%rsi), %xmm1 > - jb L(L1_fwd) > - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 > -L(L1_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_1_loop_L1): > - sub $64, %rdx > - movaps 0x0f(%rsi), %xmm2 > - movaps 0x1f(%rsi), %xmm3 > - movaps 0x2f(%rsi), %xmm4 > - movaps 0x3f(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $1, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $1, %xmm3, 
%xmm4 > - palignr $1, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $1, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_1_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_1_bwd): > - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x01(%rsi), %xmm1 > - jb L(L1_bwd) > - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 > -L(L1_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_1_bwd_loop_L1): > - movaps -0x11(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x21(%rsi), %xmm3 > - movaps -0x31(%rsi), %xmm4 > - movaps -0x41(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $1, %xmm2, %xmm1 > - palignr $1, %xmm3, %xmm2 > - palignr $1, %xmm4, %xmm3 > - palignr $1, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_1_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_2): > - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x02(%rsi), %xmm1 > - jb L(L2_fwd) > - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 > -L(L2_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_2_loop_L1): > - sub $64, %rdx > - movaps 0x0e(%rsi), %xmm2 > - movaps 0x1e(%rsi), %xmm3 > - movaps 0x2e(%rsi), %xmm4 > - movaps 0x3e(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $2, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $2, %xmm3, %xmm4 > - palignr $2, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $2, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_2_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_2_bwd): > - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x02(%rsi), %xmm1 > - jb L(L2_bwd) > - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 > -L(L2_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_2_bwd_loop_L1): > - movaps -0x12(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x22(%rsi), %xmm3 > - movaps -0x32(%rsi), %xmm4 > - movaps -0x42(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $2, %xmm2, %xmm1 > - palignr $2, %xmm3, %xmm2 > - palignr $2, %xmm4, %xmm3 > - palignr $2, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_2_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_bwd_end): > - movaps %xmm4, (%rdi) > 
- lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_3): > - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x03(%rsi), %xmm1 > - jb L(L3_fwd) > - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 > -L(L3_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_3_loop_L1): > - sub $64, %rdx > - movaps 0x0d(%rsi), %xmm2 > - movaps 0x1d(%rsi), %xmm3 > - movaps 0x2d(%rsi), %xmm4 > - movaps 0x3d(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $3, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $3, %xmm3, %xmm4 > - palignr $3, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $3, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_3_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_3_bwd): > - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x03(%rsi), %xmm1 > - jb L(L3_bwd) > - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 > -L(L3_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_3_bwd_loop_L1): > - movaps -0x13(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x23(%rsi), %xmm3 > - movaps -0x33(%rsi), %xmm4 > - movaps -0x43(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $3, %xmm2, %xmm1 > - palignr $3, %xmm3, %xmm2 > - palignr $3, %xmm4, %xmm3 > - palignr $3, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_3_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_4): > - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x04(%rsi), %xmm1 > - jb L(L4_fwd) > - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 > -L(L4_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_4_loop_L1): > - sub $64, %rdx > - movaps 0x0c(%rsi), %xmm2 > - movaps 0x1c(%rsi), %xmm3 > - movaps 0x2c(%rsi), %xmm4 > - movaps 0x3c(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $4, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $4, %xmm3, %xmm4 > - palignr $4, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $4, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_4_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_4_bwd): > - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x04(%rsi), %xmm1 > - jb L(L4_bwd) > - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 > -L(L4_bwd): > - lea -64(%rdx), %rdx 
> - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_4_bwd_loop_L1): > - movaps -0x14(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x24(%rsi), %xmm3 > - movaps -0x34(%rsi), %xmm4 > - movaps -0x44(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $4, %xmm2, %xmm1 > - palignr $4, %xmm3, %xmm2 > - palignr $4, %xmm4, %xmm3 > - palignr $4, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_4_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_5): > - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x05(%rsi), %xmm1 > - jb L(L5_fwd) > - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 > -L(L5_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_5_loop_L1): > - sub $64, %rdx > - movaps 0x0b(%rsi), %xmm2 > - movaps 0x1b(%rsi), %xmm3 > - movaps 0x2b(%rsi), %xmm4 > - movaps 0x3b(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $5, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $5, %xmm3, %xmm4 > - palignr $5, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $5, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_5_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_5_bwd): > - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x05(%rsi), %xmm1 > - jb L(L5_bwd) > - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 > -L(L5_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_5_bwd_loop_L1): > - movaps -0x15(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x25(%rsi), %xmm3 > - movaps -0x35(%rsi), %xmm4 > - movaps -0x45(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $5, %xmm2, %xmm1 > - palignr $5, %xmm3, %xmm2 > - palignr $5, %xmm4, %xmm3 > - palignr $5, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_5_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_6): > - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x06(%rsi), %xmm1 > - jb L(L6_fwd) > - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 > -L(L6_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_6_loop_L1): > - sub $64, %rdx > - movaps 0x0a(%rsi), %xmm2 > - movaps 0x1a(%rsi), %xmm3 > - movaps 0x2a(%rsi), %xmm4 > - movaps 0x3a(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $6, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $6, %xmm3, %xmm4 > - palignr $6, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $6, %xmm1, 
%xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_6_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_6_bwd): > - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x06(%rsi), %xmm1 > - jb L(L6_bwd) > - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 > -L(L6_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_6_bwd_loop_L1): > - movaps -0x16(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x26(%rsi), %xmm3 > - movaps -0x36(%rsi), %xmm4 > - movaps -0x46(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $6, %xmm2, %xmm1 > - palignr $6, %xmm3, %xmm2 > - palignr $6, %xmm4, %xmm3 > - palignr $6, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_6_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_7): > - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x07(%rsi), %xmm1 > - jb L(L7_fwd) > - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 > -L(L7_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_7_loop_L1): > - sub $64, %rdx > - movaps 0x09(%rsi), %xmm2 > - movaps 0x19(%rsi), %xmm3 > - movaps 0x29(%rsi), %xmm4 > - movaps 0x39(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $7, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $7, %xmm3, %xmm4 > - palignr $7, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $7, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_7_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_7_bwd): > - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x07(%rsi), %xmm1 > - jb L(L7_bwd) > - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 > -L(L7_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_7_bwd_loop_L1): > - movaps -0x17(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x27(%rsi), %xmm3 > - movaps -0x37(%rsi), %xmm4 > - movaps -0x47(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $7, %xmm2, %xmm1 > - palignr $7, %xmm3, %xmm2 > - palignr $7, %xmm4, %xmm3 > - palignr $7, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_7_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - 
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_8): > - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x08(%rsi), %xmm1 > - jb L(L8_fwd) > - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 > -L(L8_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > -L(shl_8_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_8_loop_L1): > - sub $64, %rdx > - movaps 0x08(%rsi), %xmm2 > - movaps 0x18(%rsi), %xmm3 > - movaps 0x28(%rsi), %xmm4 > - movaps 0x38(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $8, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $8, %xmm3, %xmm4 > - palignr $8, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $8, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_8_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > - .p2align 4 > -L(shl_8_end): > - lea 64(%rdx), %rdx > - movaps %xmm4, -0x20(%rdi) > - add %rdx, %rsi > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_8_bwd): > - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x08(%rsi), %xmm1 > - jb L(L8_bwd) > - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 > -L(L8_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_8_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_8_bwd_loop_L1): > - movaps -0x18(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x28(%rsi), %xmm3 > - movaps -0x38(%rsi), %xmm4 > - movaps -0x48(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $8, %xmm2, %xmm1 > - palignr $8, %xmm3, %xmm2 > - palignr $8, %xmm4, %xmm3 > - palignr $8, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_8_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_8_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_9): > - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x09(%rsi), %xmm1 > - jb L(L9_fwd) > - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 > -L(L9_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_9_loop_L1): > - sub $64, %rdx > - movaps 0x07(%rsi), %xmm2 > - movaps 0x17(%rsi), %xmm3 > - movaps 0x27(%rsi), %xmm4 > - movaps 0x37(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $9, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $9, %xmm3, %xmm4 > - palignr $9, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $9, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_9_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_9_bwd): > - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x09(%rsi), %xmm1 > - jb L(L9_bwd) > - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 > -L(L9_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > 
-L(shl_9_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_9_bwd_loop_L1): > - movaps -0x19(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x29(%rsi), %xmm3 > - movaps -0x39(%rsi), %xmm4 > - movaps -0x49(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $9, %xmm2, %xmm1 > - palignr $9, %xmm3, %xmm2 > - palignr $9, %xmm4, %xmm3 > - palignr $9, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_9_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_10): > - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0a(%rsi), %xmm1 > - jb L(L10_fwd) > - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 > -L(L10_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_10_loop_L1): > - sub $64, %rdx > - movaps 0x06(%rsi), %xmm2 > - movaps 0x16(%rsi), %xmm3 > - movaps 0x26(%rsi), %xmm4 > - movaps 0x36(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $10, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $10, %xmm3, %xmm4 > - palignr $10, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $10, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_10_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_10_bwd): > - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0a(%rsi), %xmm1 > - jb L(L10_bwd) > - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 > -L(L10_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_10_bwd_loop_L1): > - movaps -0x1a(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2a(%rsi), %xmm3 > - movaps -0x3a(%rsi), %xmm4 > - movaps -0x4a(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $10, %xmm2, %xmm1 > - palignr $10, %xmm3, %xmm2 > - palignr $10, %xmm4, %xmm3 > - palignr $10, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_10_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_11): > - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0b(%rsi), %xmm1 > - jb L(L11_fwd) > - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 > -L(L11_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_11_loop_L1): > - sub $64, %rdx > - movaps 0x05(%rsi), %xmm2 > - movaps 0x15(%rsi), %xmm3 > - movaps 0x25(%rsi), %xmm4 > - movaps 0x35(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $11, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $11, %xmm3, %xmm4 > - palignr $11, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $11, 
%xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_11_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_11_bwd): > - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0b(%rsi), %xmm1 > - jb L(L11_bwd) > - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 > -L(L11_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_11_bwd_loop_L1): > - movaps -0x1b(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2b(%rsi), %xmm3 > - movaps -0x3b(%rsi), %xmm4 > - movaps -0x4b(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $11, %xmm2, %xmm1 > - palignr $11, %xmm3, %xmm2 > - palignr $11, %xmm4, %xmm3 > - palignr $11, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_11_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_12): > - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0c(%rsi), %xmm1 > - jb L(L12_fwd) > - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 > -L(L12_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_12_loop_L1): > - sub $64, %rdx > - movaps 0x04(%rsi), %xmm2 > - movaps 0x14(%rsi), %xmm3 > - movaps 0x24(%rsi), %xmm4 > - movaps 0x34(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $12, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $12, %xmm3, %xmm4 > - palignr $12, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $12, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_12_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_12_bwd): > - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0c(%rsi), %xmm1 > - jb L(L12_bwd) > - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 > -L(L12_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_12_bwd_loop_L1): > - movaps -0x1c(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2c(%rsi), %xmm3 > - movaps -0x3c(%rsi), %xmm4 > - movaps -0x4c(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $12, %xmm2, %xmm1 > - palignr $12, %xmm3, %xmm2 > - palignr $12, %xmm4, %xmm3 > - palignr $12, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_12_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - 
movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_13): > - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0d(%rsi), %xmm1 > - jb L(L13_fwd) > - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 > -L(L13_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_13_loop_L1): > - sub $64, %rdx > - movaps 0x03(%rsi), %xmm2 > - movaps 0x13(%rsi), %xmm3 > - movaps 0x23(%rsi), %xmm4 > - movaps 0x33(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $13, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $13, %xmm3, %xmm4 > - palignr $13, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $13, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_13_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_13_bwd): > - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0d(%rsi), %xmm1 > - jb L(L13_bwd) > - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 > -L(L13_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_13_bwd_loop_L1): > - movaps -0x1d(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2d(%rsi), %xmm3 > - movaps -0x3d(%rsi), %xmm4 > - movaps -0x4d(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $13, %xmm2, %xmm1 > - palignr $13, %xmm3, %xmm2 > - palignr $13, %xmm4, %xmm3 > - palignr $13, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_13_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_14): > - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0e(%rsi), %xmm1 > - jb L(L14_fwd) > - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 > -L(L14_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_14_loop_L1): > - sub $64, %rdx > - movaps 0x02(%rsi), %xmm2 > - movaps 0x12(%rsi), %xmm3 > - movaps 0x22(%rsi), %xmm4 > - movaps 0x32(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $14, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $14, %xmm3, %xmm4 > - palignr $14, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $14, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_14_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_14_bwd): > - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0e(%rsi), %xmm1 > - jb L(L14_bwd) > - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 > 
-L(L14_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_14_bwd_loop_L1): > - movaps -0x1e(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2e(%rsi), %xmm3 > - movaps -0x3e(%rsi), %xmm4 > - movaps -0x4e(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $14, %xmm2, %xmm1 > - palignr $14, %xmm3, %xmm2 > - palignr $14, %xmm4, %xmm3 > - palignr $14, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_14_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_15): > - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0f(%rsi), %xmm1 > - jb L(L15_fwd) > - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 > -L(L15_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_15_loop_L1): > - sub $64, %rdx > - movaps 0x01(%rsi), %xmm2 > - movaps 0x11(%rsi), %xmm3 > - movaps 0x21(%rsi), %xmm4 > - movaps 0x31(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $15, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $15, %xmm3, %xmm4 > - palignr $15, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $15, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_15_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_15_bwd): > - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0f(%rsi), %xmm1 > - jb L(L15_bwd) > - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 > -L(L15_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_15_bwd_loop_L1): > - movaps -0x1f(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2f(%rsi), %xmm3 > - movaps -0x3f(%rsi), %xmm4 > - movaps -0x4f(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $15, %xmm2, %xmm1 > - palignr $15, %xmm3, %xmm2 > - palignr $15, %xmm4, %xmm3 > - palignr $15, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_15_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(write_72bytes): > - movdqu -72(%rsi), %xmm0 > - movdqu -56(%rsi), %xmm1 > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rcx > - movdqu %xmm0, -72(%rdi) > - movdqu %xmm1, -56(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_64bytes): > - movdqu -64(%rsi), %xmm0 > - mov -48(%rsi), %rcx > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > 
- mov -8(%rsi), %rdx > - movdqu %xmm0, -64(%rdi) > - mov %rcx, -48(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_56bytes): > - movdqu -56(%rsi), %xmm0 > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rcx > - movdqu %xmm0, -56(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_48bytes): > - mov -48(%rsi), %rcx > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %rcx, -48(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_40bytes): > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_32bytes): > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_24bytes): > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_16bytes): > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_8bytes): > - mov -8(%rsi), %rdx > - mov %rdx, -8(%rdi) > -L(write_0bytes): > - ret > - > - .p2align 4 > -L(write_73bytes): > - movdqu -73(%rsi), %xmm0 > - movdqu -57(%rsi), %xmm1 > - mov -41(%rsi), %rcx > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %r8 > - mov -4(%rsi), %edx > - movdqu %xmm0, -73(%rdi) > - movdqu %xmm1, -57(%rdi) > - mov %rcx, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %r8, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_65bytes): > - movdqu -65(%rsi), %xmm0 > - movdqu -49(%rsi), %xmm1 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -65(%rdi) > - movdqu %xmm1, -49(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_57bytes): > - movdqu -57(%rsi), %xmm0 > - mov -41(%rsi), %r8 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -57(%rdi) > - mov %r8, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_49bytes): > - movdqu -49(%rsi), %xmm0 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -49(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_41bytes): > - mov -41(%rsi), %r8 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov 
-1(%rsi), %dl > - mov %r8, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_33bytes): > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -1(%rsi), %dl > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_25bytes): > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -1(%rsi), %dl > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_17bytes): > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_9bytes): > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_1bytes): > - mov -1(%rsi), %dl > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_74bytes): > - movdqu -74(%rsi), %xmm0 > - movdqu -58(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -74(%rdi) > - movdqu %xmm1, -58(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_66bytes): > - movdqu -66(%rsi), %xmm0 > - movdqu -50(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -66(%rdi) > - movdqu %xmm1, -50(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_58bytes): > - movdqu -58(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm1, -58(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_50bytes): > - movdqu -50(%rsi), %xmm0 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -50(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_42bytes): > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_34bytes): > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_26bytes): > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret 
> - > - .p2align 4 > -L(write_18bytes): > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_10bytes): > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_2bytes): > - mov -2(%rsi), %dx > - mov %dx, -2(%rdi) > - ret > - > - .p2align 4 > -L(write_75bytes): > - movdqu -75(%rsi), %xmm0 > - movdqu -59(%rsi), %xmm1 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -75(%rdi) > - movdqu %xmm1, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_67bytes): > - movdqu -67(%rsi), %xmm0 > - movdqu -59(%rsi), %xmm1 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -67(%rdi) > - movdqu %xmm1, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_59bytes): > - movdqu -59(%rsi), %xmm0 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_51bytes): > - movdqu -51(%rsi), %xmm0 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -51(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_43bytes): > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_35bytes): > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_27bytes): > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_19bytes): > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_11bytes): > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_3bytes): > - mov -3(%rsi), %dx > - mov -2(%rsi), %cx > - mov %dx, -3(%rdi) > - mov %cx, -2(%rdi) > - ret > - > - .p2align 4 > -L(write_76bytes): > - movdqu -76(%rsi), %xmm0 > - movdqu -60(%rsi), %xmm1 > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov 
-12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -76(%rdi) > - movdqu %xmm1, -60(%rdi) > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_68bytes): > - movdqu -68(%rsi), %xmm0 > - movdqu -52(%rsi), %xmm1 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -68(%rdi) > - movdqu %xmm1, -52(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_60bytes): > - movdqu -60(%rsi), %xmm0 > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -60(%rdi) > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_52bytes): > - movdqu -52(%rsi), %xmm0 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -52(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_44bytes): > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_36bytes): > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_28bytes): > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_20bytes): > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_12bytes): > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_4bytes): > - mov -4(%rsi), %edx > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_77bytes): > - movdqu -77(%rsi), %xmm0 > - movdqu -61(%rsi), %xmm1 > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -77(%rdi) > - movdqu %xmm1, -61(%rdi) > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_69bytes): > - movdqu -69(%rsi), %xmm0 > - movdqu -53(%rsi), %xmm1 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -69(%rdi) > - movdqu %xmm1, -53(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_61bytes): > - movdqu -61(%rsi), %xmm0 > - 
mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -61(%rdi) > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_53bytes): > - movdqu -53(%rsi), %xmm0 > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -53(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_45bytes): > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_37bytes): > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_29bytes): > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_21bytes): > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_13bytes): > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_5bytes): > - mov -5(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -5(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_78bytes): > - movdqu -78(%rsi), %xmm0 > - movdqu -62(%rsi), %xmm1 > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -78(%rdi) > - movdqu %xmm1, -62(%rdi) > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_70bytes): > - movdqu -70(%rsi), %xmm0 > - movdqu -54(%rsi), %xmm1 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -70(%rdi) > - movdqu %xmm1, -54(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_62bytes): > - movdqu -62(%rsi), %xmm0 > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -62(%rdi) > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_54bytes): > - movdqu -54(%rsi), %xmm0 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -54(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, 
-8(%rdi) > - ret > - > - .p2align 4 > -L(write_46bytes): > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_38bytes): > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_30bytes): > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_22bytes): > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_14bytes): > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_6bytes): > - mov -6(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -6(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_79bytes): > - movdqu -79(%rsi), %xmm0 > - movdqu -63(%rsi), %xmm1 > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -79(%rdi) > - movdqu %xmm1, -63(%rdi) > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_71bytes): > - movdqu -71(%rsi), %xmm0 > - movdqu -55(%rsi), %xmm1 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -71(%rdi) > - movdqu %xmm1, -55(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_63bytes): > - movdqu -63(%rsi), %xmm0 > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -63(%rdi) > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_55bytes): > - movdqu -55(%rsi), %xmm0 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -55(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_47bytes): > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_39bytes): > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > 
-L(write_31bytes): > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_23bytes): > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_15bytes): > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_7bytes): > - mov -7(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -7(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(large_page_fwd): > - movdqu (%rsi), %xmm1 > - lea 16(%rsi), %rsi > - movdqu %xmm0, (%r8) > - movntdq %xmm1, (%rdi) > - lea 16(%rdi), %rdi > - lea -0x90(%rdx), %rdx > -#ifdef USE_AS_MEMMOVE > - mov %rsi, %r9 > - sub %rdi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_fwd) > - shl $2, %rcx > - cmp %rcx, %rdx > - jb L(ll_cache_copy_fwd_start) > -L(memmove_is_memcpy_fwd): > -#endif > -L(large_page_loop): > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - movntdq %xmm4, 0x40(%rdi) > - movntdq %xmm5, 0x50(%rdi) > - movntdq %xmm6, 0x60(%rdi) > - movntdq %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - jae L(large_page_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_less_64bytes) > - > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - lea 0x40(%rsi), %rsi > - > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - lea 0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - sfence > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#ifdef USE_AS_MEMMOVE > - .p2align 4 > -L(ll_cache_copy_fwd_start): > - prefetcht0 0x1c0(%rsi) > - prefetcht0 0x200(%rsi) > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movaps %xmm0, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - movaps %xmm4, 0x40(%rdi) > - movaps %xmm5, 0x50(%rdi) > - movaps %xmm6, 0x60(%rdi) > - movaps %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - jae L(ll_cache_copy_fwd_start) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_ll_less_fwd_64bytes) > - > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - lea 0x40(%rsi), %rsi > - > - movaps %xmm0, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - lea 0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_ll_less_fwd_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#endif > - .p2align 4 > -L(large_page_bwd): > - movdqu -0x10(%rsi), %xmm1 > - lea -16(%rsi), %rsi > - movdqu %xmm0, 
(%r8) > - movdqa %xmm1, -0x10(%rdi) > - lea -16(%rdi), %rdi > - lea -0x90(%rdx), %rdx > -#ifdef USE_AS_MEMMOVE > - mov %rdi, %r9 > - sub %rsi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_bwd) > - cmp %rcx, %r9 > - jb L(ll_cache_copy_bwd_start) > -L(memmove_is_memcpy_bwd): > -#endif > -L(large_page_bwd_loop): > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - movdqu -0x50(%rsi), %xmm4 > - movdqu -0x60(%rsi), %xmm5 > - movdqu -0x70(%rsi), %xmm6 > - movdqu -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movntdq %xmm0, -0x10(%rdi) > - movntdq %xmm1, -0x20(%rdi) > - movntdq %xmm2, -0x30(%rdi) > - movntdq %xmm3, -0x40(%rdi) > - movntdq %xmm4, -0x50(%rdi) > - movntdq %xmm5, -0x60(%rdi) > - movntdq %xmm6, -0x70(%rdi) > - movntdq %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - jae L(large_page_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_less_bwd_64bytes) > - > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - lea -0x40(%rsi), %rsi > - > - movntdq %xmm0, -0x10(%rdi) > - movntdq %xmm1, -0x20(%rdi) > - movntdq %xmm2, -0x30(%rdi) > - movntdq %xmm3, -0x40(%rdi) > - lea -0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_less_bwd_64bytes): > - sfence > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#ifdef USE_AS_MEMMOVE > - .p2align 4 > -L(ll_cache_copy_bwd_start): > - prefetcht0 -0x1c0(%rsi) > - prefetcht0 -0x200(%rsi) > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - movdqu -0x50(%rsi), %xmm4 > - movdqu -0x60(%rsi), %xmm5 > - movdqu -0x70(%rsi), %xmm6 > - movdqu -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - movaps %xmm4, -0x50(%rdi) > - movaps %xmm5, -0x60(%rdi) > - movaps %xmm6, -0x70(%rdi) > - movaps %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - jae L(ll_cache_copy_bwd_start) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_ll_less_bwd_64bytes) > - > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - lea -0x40(%rsi), %rsi > - > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - lea -0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_ll_less_bwd_64bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > -#endif > - > -END (MEMCPY) > - > - .section .rodata.ssse3,"a",@progbits > - .p2align 3 > -L(table_less_80bytes): > - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_13bytes), 
L(table_less_80bytes)) > - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_74bytes), 
L(table_less_80bytes)) > - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) > - > - .p2align 3 > -L(shl_table): > - .int JMPTBL (L(shl_0), L(shl_table)) > - .int JMPTBL (L(shl_1), L(shl_table)) > - .int JMPTBL (L(shl_2), L(shl_table)) > - .int JMPTBL (L(shl_3), L(shl_table)) > - .int JMPTBL (L(shl_4), L(shl_table)) > - .int JMPTBL (L(shl_5), L(shl_table)) > - .int JMPTBL (L(shl_6), L(shl_table)) > - .int JMPTBL (L(shl_7), L(shl_table)) > - .int JMPTBL (L(shl_8), L(shl_table)) > - .int JMPTBL (L(shl_9), L(shl_table)) > - .int JMPTBL (L(shl_10), L(shl_table)) > - .int JMPTBL (L(shl_11), L(shl_table)) > - .int JMPTBL (L(shl_12), L(shl_table)) > - .int JMPTBL (L(shl_13), L(shl_table)) > - .int JMPTBL (L(shl_14), L(shl_table)) > - .int JMPTBL (L(shl_15), L(shl_table)) > - > - .p2align 3 > -L(shl_table_bwd): > - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) > - > -#endif > diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S > index 295430b1ef..84e4e0f6cb 100644 > --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S > +++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S > @@ -1,4 +1,382 @@ > -#define USE_AS_MEMMOVE > -#define MEMCPY __memmove_ssse3 > -#define MEMCPY_CHK __memmove_chk_ssse3 > -#include "memcpy-ssse3.S" > +#include <sysdep.h> > + > +#ifndef MEMMOVE > +# define MEMMOVE __memmove_ssse3 > +# define MEMMOVE_CHK __memmove_chk_ssse3 > +# define MEMCPY __memcpy_ssse3 > +# define MEMCPY_CHK __memcpy_chk_ssse3 > +# define MEMPCPY __mempcpy_ssse3 > +# define MEMPCPY_CHK __mempcpy_chk_ssse3 > +#endif > + > + .section .text.ssse3, "ax", @progbits > +ENTRY(MEMPCPY_CHK) > + cmp %RDX_LP, %RCX_LP > + jb HIDDEN_JUMPTARGET(__chk_fail) > +END(MEMPCPY_CHK) > + > +ENTRY(MEMPCPY) > + mov %RDI_LP, %RAX_LP > + add %RDX_LP, %RAX_LP > + jmp L(start) > +END(MEMPCPY) > + > +ENTRY(MEMMOVE_CHK) > + cmp %RDX_LP, %RCX_LP > + jb HIDDEN_JUMPTARGET(__chk_fail) > +END(MEMMOVE_CHK) > + > +ENTRY_P2ALIGN(MEMMOVE, 6) > + movq %rdi, %rax > +L(start): > + cmpq $16, %rdx > + jb L(copy_0_15) > + > + /* These loads are always useful. 
*/ > + movups 0(%rsi), %xmm0 > + movups -16(%rsi, %rdx), %xmm7 > + cmpq $32, %rdx > + ja L(more_2x_vec) > + > + movups %xmm0, 0(%rdi) > + movups %xmm7, -16(%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > +L(copy_4x_vec): > + movups 16(%rsi), %xmm1 > + movups -32(%rsi, %rdx), %xmm2 > + > + movups %xmm0, 0(%rdi) > + movups %xmm1, 16(%rdi) > + movups %xmm2, -32(%rdi, %rdx) > + movups %xmm7, -16(%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > +L(copy_0_15): > + cmpl $8, %edx > + ja L(copy_9_15) > + > + cmpl $4, %edx > + jb L(copy_0_3) > + > + movl 0(%rsi), %ecx > + movl -4(%rsi, %rdx), %esi > + movl %ecx, 0(%rdi) > + movl %esi, -4(%rdi, %rdx) > + ret > + > + .p2align 4,, 8 > +L(copy_9_15): > + movq 0(%rsi), %rcx > + movq -8(%rsi, %rdx), %rsi > + movq %rcx, 0(%rdi) > + movq %rsi, -8(%rdi, %rdx) > + ret > + > + .p2align 4,, 4 > +L(copy_0_3): > + cmpl $1, %edx > + jl L(copy_0_0) > + movzbl (%rsi), %ecx > + je L(copy_0_1) > + > + movzwl -2(%rsi, %rdx), %esi > + movw %si, -2(%rdi, %rdx) > +L(copy_0_1): > + movb %cl, (%rdi) > +L(copy_0_0): > +L(nop): > + ret > + > + .p2align 4 > +L(more_2x_vec): > + cmpq $64, %rdx > + jbe L(copy_4x_vec) > + > + /* We use rcx later to get alignr value. */ > + movq %rdi, %rcx > + > + /* Backward copy for overlap + dst > src for memmove safety. */ > + subq %rsi, %rcx > + cmpq %rdx, %rcx > + jb L(copy_backward) > + > + /* Load tail. */ > + > + /* -16(%rsi, %rdx) already loaded into xmm7. */ > + movups -32(%rsi, %rdx), %xmm8 > + movups -48(%rsi, %rdx), %xmm9 > + > + /* Get misalignment. */ > + andl $0xf, %ecx > + > + movq %rsi, %r9 > + addq %rcx, %rsi > + andq $-16, %rsi > + /* Get first vec for `palignr`. */ > + movaps (%rsi), %xmm1 > + > + /* We have loaded (%rsi) so safe to do this store before the > + loop. */ > + movups %xmm0, (%rdi) > + > +#ifdef SHARED_CACHE_SIZE_HALF > + cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP > +#else > + cmp __x86_shared_cache_size_half(%rip), %rdx > +#endif > + ja L(large_memcpy) > + > + leaq -64(%rdi, %rdx), %r8 > + andq $-16, %rdi > + movl $48, %edx > + > + leaq L(loop_fwd_start)(%rip), %r9 > + sall $6, %ecx > + addq %r9, %rcx > + jmp * %rcx > + > + .p2align 4,, 8 > +L(copy_backward): > + testq %rcx, %rcx > + jz L(nop) > + > + /* Preload tail. */ > + > + /* (%rsi) already loaded into xmm0. */ > + movups 16(%rsi), %xmm4 > + movups 32(%rsi), %xmm5 > + > + movq %rdi, %r8 > + subq %rdi, %rsi > + leaq -49(%rdi, %rdx), %rdi > + andq $-16, %rdi > + addq %rdi, %rsi > + andq $-16, %rsi > + > + movaps 48(%rsi), %xmm6 > + > + > + leaq L(loop_bkwd_start)(%rip), %r9 > + andl $0xf, %ecx > + sall $6, %ecx > + addq %r9, %rcx > + jmp * %rcx > + > + .p2align 4,, 8 > +L(large_memcpy): > + movups -64(%r9, %rdx), %xmm10 > + movups -80(%r9, %rdx), %xmm11 > + > + sall $5, %ecx > + leal (%rcx, %rcx, 2), %r8d > + leaq -96(%rdi, %rdx), %rcx > + andq $-16, %rdi > + leaq L(large_loop_fwd_start)(%rip), %rdx > + addq %r8, %rdx > + jmp * %rdx > + > + > + /* Instead of a typical jump table all 16 loops are exactly > + 64-bytes in size. So, we can just jump to first loop + r8 * > + 64. Before modifying any loop ensure all their sizes match! 
> + */ > + .p2align 6 > +L(loop_fwd_start): > +L(loop_fwd_0x0): > + movaps 16(%rsi), %xmm1 > + movaps 32(%rsi), %xmm2 > + movaps 48(%rsi), %xmm3 > + movaps %xmm1, 16(%rdi) > + movaps %xmm2, 32(%rdi) > + movaps %xmm3, 48(%rdi) > + addq %rdx, %rdi > + addq %rdx, %rsi > + cmpq %rdi, %r8 > + ja L(loop_fwd_0x0) > +L(end_loop_fwd): > + movups %xmm9, 16(%r8) > + movups %xmm8, 32(%r8) > + movups %xmm7, 48(%r8) > + ret > + > + /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding. > + 60 bytes otherwise. */ > +#define ALIGNED_LOOP_FWD(align_by); \ > + .p2align 6; \ > +L(loop_fwd_ ## align_by): \ > + movaps 16(%rsi), %xmm0; \ > + movaps 32(%rsi), %xmm2; \ > + movaps 48(%rsi), %xmm3; \ > + movaps %xmm3, %xmm4; \ > + palignr $align_by, %xmm2, %xmm3; \ > + palignr $align_by, %xmm0, %xmm2; \ > + palignr $align_by, %xmm1, %xmm0; \ > + movaps %xmm4, %xmm1; \ > + movaps %xmm0, 16(%rdi); \ > + movaps %xmm2, 32(%rdi); \ > + movaps %xmm3, 48(%rdi); \ > + addq %rdx, %rdi; \ > + addq %rdx, %rsi; \ > + cmpq %rdi, %r8; \ > + ja L(loop_fwd_ ## align_by); \ > + jmp L(end_loop_fwd); > + > + /* Must be in descending order. */ > + ALIGNED_LOOP_FWD (0xf) > + ALIGNED_LOOP_FWD (0xe) > + ALIGNED_LOOP_FWD (0xd) > + ALIGNED_LOOP_FWD (0xc) > + ALIGNED_LOOP_FWD (0xb) > + ALIGNED_LOOP_FWD (0xa) > + ALIGNED_LOOP_FWD (0x9) > + ALIGNED_LOOP_FWD (0x8) > + ALIGNED_LOOP_FWD (0x7) > + ALIGNED_LOOP_FWD (0x6) > + ALIGNED_LOOP_FWD (0x5) > + ALIGNED_LOOP_FWD (0x4) > + ALIGNED_LOOP_FWD (0x3) > + ALIGNED_LOOP_FWD (0x2) > + ALIGNED_LOOP_FWD (0x1) > + > + .p2align 6 > +L(large_loop_fwd_start): > +L(large_loop_fwd_0x0): > + movaps 16(%rsi), %xmm1 > + movaps 32(%rsi), %xmm2 > + movaps 48(%rsi), %xmm3 > + movaps 64(%rsi), %xmm4 > + movaps 80(%rsi), %xmm5 > + movntps %xmm1, 16(%rdi) > + movntps %xmm2, 32(%rdi) > + movntps %xmm3, 48(%rdi) > + movntps %xmm4, 64(%rdi) > + movntps %xmm5, 80(%rdi) > + addq $80, %rdi > + addq $80, %rsi > + cmpq %rdi, %rcx > + ja L(large_loop_fwd_0x0) > + > + /* Ensure no icache line split on tail. */ > + .p2align 4 > +L(end_large_loop_fwd): > + sfence > + movups %xmm11, 16(%rcx) > + movups %xmm10, 32(%rcx) > + movups %xmm9, 48(%rcx) > + movups %xmm8, 64(%rcx) > + movups %xmm7, 80(%rcx) > + ret > + > + > + /* Loops are > 64 bytes and <= 96 bytes in size. 32-byte alignment > + between them ensures 96-byte spacing between each. */ > +#define ALIGNED_LARGE_LOOP_FWD(align_by); \ > + .p2align 5; \ > +L(large_loop_fwd_ ## align_by): \ > + movaps 16(%rsi), %xmm0; \ > + movaps 32(%rsi), %xmm2; \ > + movaps 48(%rsi), %xmm3; \ > + movaps 64(%rsi), %xmm4; \ > + movaps 80(%rsi), %xmm5; \ > + movaps %xmm5, %xmm6; \ > + palignr $align_by, %xmm4, %xmm5; \ > + palignr $align_by, %xmm3, %xmm4; \ > + palignr $align_by, %xmm2, %xmm3; \ > + palignr $align_by, %xmm0, %xmm2; \ > + palignr $align_by, %xmm1, %xmm0; \ > + movaps %xmm6, %xmm1; \ > + movntps %xmm0, 16(%rdi); \ > + movntps %xmm2, 32(%rdi); \ > + movntps %xmm3, 48(%rdi); \ > + movntps %xmm4, 64(%rdi); \ > + movntps %xmm5, 80(%rdi); \ > + addq $80, %rdi; \ > + addq $80, %rsi; \ > + cmpq %rdi, %rcx; \ > + ja L(large_loop_fwd_ ## align_by); \ > + jmp L(end_large_loop_fwd); > + > + /* Must be in descending order. 
*/ > + ALIGNED_LARGE_LOOP_FWD (0xf) > + ALIGNED_LARGE_LOOP_FWD (0xe) > + ALIGNED_LARGE_LOOP_FWD (0xd) > + ALIGNED_LARGE_LOOP_FWD (0xc) > + ALIGNED_LARGE_LOOP_FWD (0xb) > + ALIGNED_LARGE_LOOP_FWD (0xa) > + ALIGNED_LARGE_LOOP_FWD (0x9) > + ALIGNED_LARGE_LOOP_FWD (0x8) > + ALIGNED_LARGE_LOOP_FWD (0x7) > + ALIGNED_LARGE_LOOP_FWD (0x6) > + ALIGNED_LARGE_LOOP_FWD (0x5) > + ALIGNED_LARGE_LOOP_FWD (0x4) > + ALIGNED_LARGE_LOOP_FWD (0x3) > + ALIGNED_LARGE_LOOP_FWD (0x2) > + ALIGNED_LARGE_LOOP_FWD (0x1) > + > + > + .p2align 6 > +L(loop_bkwd_start): > +L(loop_bkwd_0x0): > + movaps 32(%rsi), %xmm1 > + movaps 16(%rsi), %xmm2 > + movaps 0(%rsi), %xmm3 > + movaps %xmm1, 32(%rdi) > + movaps %xmm2, 16(%rdi) > + movaps %xmm3, 0(%rdi) > + subq $48, %rdi > + subq $48, %rsi > + cmpq %rdi, %r8 > + jb L(loop_bkwd_0x0) > +L(end_loop_bkwd): > + movups %xmm7, -16(%r8, %rdx) > + movups %xmm0, 0(%r8) > + movups %xmm4, 16(%r8) > + movups %xmm5, 32(%r8) > + > + ret > + > + > + /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding. > + 60 bytes otherwise. */ > +#define ALIGNED_LOOP_BKWD(align_by); \ > + .p2align 6; \ > +L(loop_bkwd_ ## align_by): \ > + movaps 32(%rsi), %xmm1; \ > + movaps 16(%rsi), %xmm2; \ > + movaps 0(%rsi), %xmm3; \ > + palignr $align_by, %xmm1, %xmm6; \ > + palignr $align_by, %xmm2, %xmm1; \ > + palignr $align_by, %xmm3, %xmm2; \ > + movaps %xmm6, 32(%rdi); \ > + movaps %xmm1, 16(%rdi); \ > + movaps %xmm2, 0(%rdi); \ > + subq $48, %rdi; \ > + subq $48, %rsi; \ > + movaps %xmm3, %xmm6; \ > + cmpq %rdi, %r8; \ > + jb L(loop_bkwd_ ## align_by); \ > + jmp L(end_loop_bkwd); > + > + /* Must be in descending order. */ > + ALIGNED_LOOP_BKWD (0xf) > + ALIGNED_LOOP_BKWD (0xe) > + ALIGNED_LOOP_BKWD (0xd) > + ALIGNED_LOOP_BKWD (0xc) > + ALIGNED_LOOP_BKWD (0xb) > + ALIGNED_LOOP_BKWD (0xa) > + ALIGNED_LOOP_BKWD (0x9) > + ALIGNED_LOOP_BKWD (0x8) > + ALIGNED_LOOP_BKWD (0x7) > + ALIGNED_LOOP_BKWD (0x6) > + ALIGNED_LOOP_BKWD (0x5) > + ALIGNED_LOOP_BKWD (0x4) > + ALIGNED_LOOP_BKWD (0x3) > + ALIGNED_LOOP_BKWD (0x2) > + ALIGNED_LOOP_BKWD (0x1) > +END(MEMMOVE) > + > +strong_alias (MEMMOVE, MEMCPY) > +strong_alias (MEMMOVE_CHK, MEMCPY_CHK) > -- > 2.25.1 > ^ permalink raw reply [flat|nested] 49+ messages in thread
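As a rough C model of the direction check the new memmove performs with `subq %rsi, %rcx; cmpq %rdx, %rcx; jb L(copy_backward)` (illustrative only: `toy_memmove` is not glibc code, and the real routine copies 16-byte vectors dispatched on `palignr` alignment rather than bytes):

#include <stddef.h>
#include <stdint.h>

/* Toy model of the memmove overlap test.  Computing dst - src as an
   unsigned value folds "dst > src" and "distance < len" into a single
   comparison: only copies whose destination starts inside the source
   region must run backward.  */
static void
toy_memmove (unsigned char *dst, const unsigned char *src, size_t len)
{
  if ((uintptr_t) dst - (uintptr_t) src >= (uintptr_t) len)
    {
      /* Disjoint, or dst precedes src: forward copy is safe.  */
      for (size_t i = 0; i < len; i++)
        dst[i] = src[i];
    }
  else
    {
      /* dst lands inside [src, src + len): copy backward.  */
      for (size_t i = len; i-- > 0;)
        dst[i] = src[i];
    }
}

A single sub/cmp/jb gives the assembly the same decision; everything else in the patch is about making the two copy directions fast.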
* [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (4 preceding siblings ...) 2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein @ 2022-04-10 0:42 ` Noah Goldstein 2022-04-10 0:48 ` Noah Goldstein 2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein ` (3 subsequent siblings) 9 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw) To: libc-alpha New code saves size (-303 bytes) and has significantly better performance. geometric_mean(N=20) of page cross cases New / Original: 0.634 --- sysdeps/x86_64/memcmp.S | 884 ++++++++++++++--------- sysdeps/x86_64/memcmpeq.S | 2 +- sysdeps/x86_64/multiarch/Makefile | 2 +- sysdeps/x86_64/multiarch/memcmp-sse2.S | 4 +- sysdeps/x86_64/multiarch/memcmpeq-sse2.S | 4 +- sysdeps/x86_64/multiarch/wmemcmp-c.c | 9 - sysdeps/x86_64/multiarch/wmemcmp-sse2.S | 25 + sysdeps/x86_64/wmemcmp.S | 21 + 8 files changed, 575 insertions(+), 376 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S create mode 100644 sysdeps/x86_64/wmemcmp.S diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S index e02a53ea1e..b153694048 100644 --- a/sysdeps/x86_64/memcmp.S +++ b/sysdeps/x86_64/memcmp.S @@ -18,395 +18,557 @@ #include <sysdep.h> - .text -ENTRY (memcmp) -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - movl %edx, %edx +#ifdef USE_AS_WMEMCMP +# define PCMPEQ pcmpeqd +# define CHAR_SIZE 4 +# define SIZE_OFFSET (0) +#else +# define PCMPEQ pcmpeqb +# define CHAR_SIZE 1 #endif - test %RDX_LP, %RDX_LP - jz L(finz) - cmpq $1, %rdx - jbe L(finr1b) - subq %rdi, %rsi - movq %rdx, %r10 - cmpq $32, %r10 - jae L(gt32) - /* Handle small chunks and last block of less than 32 bytes. */ -L(small): - testq $1, %r10 - jz L(s2b) - movzbl (%rdi), %eax - movzbl (%rdi, %rsi), %edx - subq $1, %r10 - je L(finz1) - addq $1, %rdi - subl %edx, %eax - jnz L(exit) -L(s2b): - testq $2, %r10 - jz L(s4b) - movzwl (%rdi), %eax - movzwl (%rdi, %rsi), %edx - subq $2, %r10 + #ifdef USE_AS_MEMCMPEQ - je L(finz1) +# define SIZE_OFFSET (0) +# define CHECK_CMP(x, y) subl x, y #else - je L(fin2_7) +# ifndef SIZE_OFFSET +# define SIZE_OFFSET (CHAR_PER_VEC * 2) +# endif +# define CHECK_CMP(x, y) cmpl x, y #endif - addq $2, %rdi - cmpl %edx, %eax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + +#define VEC_SIZE 16 +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + +#ifndef MEMCMP +# define MEMCMP memcmp +#endif + + .text +ENTRY(MEMCMP) +#ifdef USE_AS_WMEMCMP + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store + in ecx for code size. This is preferable to using `incw` as + it avoids partial register stalls on older hardware (pre + SnB). */ + movl $0xffff, %ecx +#endif + cmpq $CHAR_PER_VEC, %rdx + ja L(more_1x_vec) + +#ifdef USE_AS_WMEMCMP + /* saves a byte of code by keeping the fall through path n = [2, 4] + in the initial cache line. 
*/ + decl %edx + jle L(cmp_0_1) + + movq (%rsi), %xmm0 + movq (%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_0) + + movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 + movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_end_0_adj) #else - jnz L(fin2_7) + cmpl $8, %edx + ja L(cmp_9_16) + + cmpl $4, %edx + jb L(cmp_0_3) + +# ifdef USE_AS_MEMCMPEQ + movl (%rsi), %eax + subl (%rdi), %eax + + movl -4(%rsi, %rdx), %esi + subl -4(%rdi, %rdx), %esi + + orl %esi, %eax + ret +# else + /* Combine comparisons for lo and hi 4-byte comparisons. */ + movl -4(%rsi, %rdx), %ecx + movl -4(%rdi, %rdx), %eax + shlq $32, %rcx + shlq $32, %rax + movl (%rsi), %esi + movl (%rdi), %edi + orq %rsi, %rcx + orq %rdi, %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + xorl %eax, %eax + ret +# endif + + .p2align 4,, 10 +L(cmp_9_16): +# ifdef USE_AS_MEMCMPEQ + movq (%rsi), %rax + subq (%rdi), %rax + + movq -8(%rsi, %rdx), %rcx + subq -8(%rdi, %rdx), %rcx + orq %rcx, %rax + /* Convert 64 bit -> 32 bit boolean (we should have made the ABI + return long). */ + setnz %cl + movzbl %cl, %eax +# else + movq (%rsi), %rcx + movq (%rdi), %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + + movq -8(%rsi, %rdx, CHAR_SIZE), %rcx + movq -8(%rdi, %rdx, CHAR_SIZE), %rax + /* Only compute proper return if not-equal. */ + cmpq %rcx, %rax + jnz L(ret_nonzero) + xorl %eax, %eax +# endif #endif -L(s4b): - testq $4, %r10 - jz L(s8b) - movl (%rdi), %eax - movl (%rdi, %rsi), %edx - subq $4, %r10 -#ifdef USE_AS_MEMCMPEQ - je L(finz1) + ret + + .p2align 4,, 8 +L(cmp_0_1): + /* Flag set by earlier comparison against 1. */ + jne L(cmp_0_0) +#ifdef USE_AS_WMEMCMP + movl (%rdi), %ecx + xorl %edx, %edx + cmpl (%rsi), %ecx + je L(cmp_0_0) + setg %dl + leal -1(%rdx, %rdx), %eax #else - je L(fin2_7) + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + subl %ecx, %eax #endif - addq $4, %rdi - cmpl %edx, %eax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + ret + + /* Fits in aligning bytes. */ +L(cmp_0_0): + xorl %eax, %eax + ret + +#ifdef USE_AS_WMEMCMP + .p2align 4 +L(ret_nonzero_vec_start_0): + bsfl %eax, %eax + movl (%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + ret +#else + +# ifndef USE_AS_MEMCMPEQ + .p2align 4,, 14 +L(ret_nonzero): + /* Need to bswap to get proper return without branch. */ + bswapq %rcx + bswapq %rax + subq %rcx, %rax + sbbl %eax, %eax + orl $1, %eax + ret +# endif + + .p2align 4 +L(cmp_0_3): +# ifdef USE_AS_MEMCMPEQ + /* No reason to add to dependency chain on rdx. Saving the + bytes here doesn't change the number of fetch blocks. */ + cmpl $1, %edx + jbe L(cmp_0_1) +# else + /* We need the code size to prevent taking an extra fetch block. 
*/ + movb (%rsi, %rdx), %cl + movzbl (%rdi, %rdx), %edi + orl %edi, %eax + subl %ecx, %eax +# endif + ret +#endif + + .p2align 5 +L(more_1x_vec): +#ifndef USE_AS_WMEMCMP + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store + in ecx for code size. This is preferable to using `incw` as + it avoids partial register stalls on older hardware (pre + SnB). */ + movl $0xffff, %ecx +#endif + movups (%rsi), %xmm0 + movups (%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_0) +#if SIZE_OFFSET == 0 + cmpq $(CHAR_PER_VEC * 2), %rdx #else - jnz L(fin2_7) + /* Offset rdx. Saves just enough code size to keep the + L(last_2x_vec) case and the non-zero return in a single + cache line. */ + subq $(CHAR_PER_VEC * 2), %rdx #endif -L(s8b): - testq $8, %r10 - jz L(s16b) - movq (%rdi), %rax - movq (%rdi, %rsi), %rdx - subq $8, %r10 -#ifdef USE_AS_MEMCMPEQ - je L(sub_return8) + ja L(more_2x_vec) + + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax +#ifndef USE_AS_MEMCMPEQ + /* Don't use `incw ax` as machines this code runs on are liable + to have partial register stall. */ + jnz L(ret_nonzero_vec_end_0) #else - je L(fin2_7) + /* Various return targets for memcmpeq. Will always be hot in + Icache and get short encoding. */ +L(ret_nonzero_vec_start_1): +L(ret_nonzero_vec_start_0): +L(ret_nonzero_vec_end_0): #endif - addq $8, %rdi - cmpq %rdx, %rax -#ifdef USE_AS_MEMCMPEQ - jnz L(neq_early) + ret + +#ifndef USE_AS_MEMCMPEQ +# ifdef USE_AS_WMEMCMP + .p2align 4 +L(ret_nonzero_vec_end_0_adj): + addl $3, %edx +# else + .p2align 4,, 8 +# endif +L(ret_nonzero_vec_end_0): + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + leal (%rax, %rdx, CHAR_SIZE), %eax + movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + addl %edx, %eax + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret +# ifndef USE_AS_WMEMCMP + .p2align 4,, 10 +L(ret_nonzero_vec_start_0): + bsfl %eax, %eax + movzbl (%rsi, %rax), %ecx + movzbl (%rdi, %rax), %eax + subl %ecx, %eax + ret +# endif #else - jnz L(fin2_7) #endif -L(s16b): - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 + + .p2align 5 +L(more_2x_vec): + movups (VEC_SIZE * 1)(%rsi), %xmm0 + movups (VEC_SIZE * 1)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + pmovmskb %xmm1, %eax + subl %ecx, %eax + jnz L(ret_nonzero_vec_start_1) + + cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx + jbe L(last_2x_vec) + + cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx + ja L(more_8x_vec) + + /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. + This can harm performance if non-zero return in [65, 80] or + [97, 112] but helps performance otherwise. Generally zero- + return is hotter. 
*/ + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 2)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 3)(%rsi), %xmm2 + movups (VEC_SIZE * 3)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) + jnz L(ret_nonzero_vec_start_2_3) + + cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx + jbe L(last_2x_vec) + + movups (VEC_SIZE * 4)(%rsi), %xmm0 + movups (VEC_SIZE * 4)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 5)(%rsi), %xmm2 + movups (VEC_SIZE * 5)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) #ifdef USE_AS_MEMCMPEQ - pmovmskb %xmm1, %eax - subl $0xffff, %eax + jz L(last_2x_vec) ret #else - pmovmskb %xmm1, %edx - xorl %eax, %eax - subl $0xffff, %edx - jz L(finz) - bsfl %edx, %ecx - leaq (%rdi, %rcx), %rcx - movzbl (%rcx), %eax - movzbl (%rsi, %rcx), %edx - jmp L(finz1) + jnz L(ret_nonzero_vec_start_4_5) #endif - .p2align 4,, 4 -L(finr1b): - movzbl (%rdi), %eax - movzbl (%rsi), %edx -L(finz1): - subl %edx, %eax -L(exit): - ret + .p2align 4 +L(last_2x_vec): + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + pmovmskb %xmm3, %eax + subl %ecx, %eax #ifdef USE_AS_MEMCMPEQ - .p2align 4,, 4 -L(sub_return8): - subq %rdx, %rax - movl %eax, %edx - shrq $32, %rax - orl %edx, %eax + /* Various return targets for memcmpeq. Will always be hot in + Icache and get short encoding. */ +L(ret_nonzero_vec_start_2_3): +L(ret_nonzero_vec_start_4_5): ret #else - .p2align 4,, 4 -L(fin2_7): - cmpq %rdx, %rax - jz L(finz) - movq %rax, %r11 - subq %rdx, %r11 - bsfq %r11, %rcx - sarq $3, %rcx - salq $3, %rcx - sarq %cl, %rax - movzbl %al, %eax - sarq %cl, %rdx - movzbl %dl, %edx - subl %edx, %eax + jnz L(ret_nonzero_vec_end_1) ret -#endif - .p2align 4,, 4 -L(finz): - xorl %eax, %eax + + .p2align 4,, 8 +L(ret_nonzero_vec_end_1): + pmovmskb %xmm1, %ecx + /* High 16 bits of eax guaranteed to be all ones. Rotate them in + so we can do `or + not` with just `xor`. */ + rorl $16, %eax + xorl %ecx, %eax + /* Partial register stall. */ + + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + leal (%rax, %rdx, CHAR_SIZE), %eax + movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + addl %edx, %eax + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret -#ifdef USE_AS_MEMCMPEQ - .p2align 4,, 4 -L(neq_early): - movl $1, %eax + + .p2align 4 +L(ret_nonzero_vec_start_4_5): + pmovmskb %xmm1, %edx + sall $16, %eax + leal 1(%rax, %rdx), %eax + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 4)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. 
*/ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret + + .p2align 4,, 8 +L(ret_nonzero_vec_start_1): + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 1)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax + subl %ecx, %eax +# endif ret #endif - /* For blocks bigger than 32 bytes - 1. Advance one of the addr pointer to be 16B aligned. - 2. Treat the case of both addr pointers aligned to 16B - separately to avoid movdqu. - 3. Handle any blocks of greater than 64 consecutive bytes with - unrolling to reduce branches. - 4. At least one addr pointer is 16B aligned, use memory version - of pcmbeqb. - */ - .p2align 4,, 4 -L(gt32): - movq %rdx, %r11 - addq %rdi, %r11 - movq %rdi, %r8 - - andq $15, %r8 - jz L(16am) - /* Both pointers may be misaligned. */ - movdqu (%rdi), %xmm1 - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb %xmm0, %xmm1 - pmovmskb %xmm1, %edx - subl $0xffff, %edx - jnz L(neq) - neg %r8 - leaq 16(%rdi, %r8), %rdi -L(16am): - /* Handle two 16B aligned pointers separately. */ - testq $15, %rsi - jz L(ATR) - testq $16, %rdi - jz L(A32) - movdqu (%rdi, %rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi -L(A32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - /* Pre-unroll to be ready for unrolled 64B loop. */ - testq $32, %rdi - jz L(A64) - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(A64): - movq %r11, %r10 - andq $-64, %r10 - cmpq %r10, %rdi - jae L(mt32) - -L(A64main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A64main) - -L(mt32): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - -L(A32main): - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqu (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %rdi, %r10 - jne L(A32main) -L(mt16): - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - - .p2align 4,, 4 -L(neq): -#ifdef USE_AS_MEMCMPEQ - movl $1, %eax - ret -#else - bsfl %edx, %ecx - movzbl (%rdi, %rcx), %eax - addq %rdi, %rsi - movzbl (%rsi,%rcx), %edx - jmp L(finz1) + + .p2align 4 +L(more_8x_vec): + subq %rdi, %rsi + leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx + andq $(VEC_SIZE * -1), %rdi + addq %rdi, %rsi + .p2align 4 +L(loop_4x): + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 3)(%rsi), %xmm1 + + PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 + PCMPEQ 
(VEC_SIZE * 3)(%rdi), %xmm1 + + movups (VEC_SIZE * 4)(%rsi), %xmm2 + movups (VEC_SIZE * 5)(%rsi), %xmm3 + + PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 + PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 + + pand %xmm0, %xmm1 + pand %xmm2, %xmm3 + pand %xmm1, %xmm3 + + pmovmskb %xmm3, %eax + subl %ecx, %eax + jnz L(ret_nonzero_loop) + + addq $(VEC_SIZE * 4), %rdi + addq $(VEC_SIZE * 4), %rsi + cmpq %rdi, %rdx + ja L(loop_4x) + /* Get remaining length in edx. */ + subl %edi, %edx + /* Restore offset so we can reuse L(last_2x_vec). */ + addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx +#ifdef USE_AS_WMEMCMP + shrl $2, %edx #endif + cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx + jbe L(last_2x_vec) + - .p2align 4,, 4 -L(ATR): - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - testq $16, %rdi - jz L(ATR32) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - je L(mt16) - -L(ATR32): - movq %r11, %r10 - andq $-64, %r10 - testq $32, %rdi - jz L(ATR64) - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - -L(ATR64): - cmpq %rdi, %r10 - je L(mt32) - -L(ATR64main): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - cmpq %rdi, %r10 - jne L(ATR64main) - - movq %r11, %r10 - andq $-32, %r10 - cmpq %r10, %rdi - jae L(mt16) - -L(ATR32res): - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - movdqa (%rdi,%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - pmovmskb %xmm0, %edx - subl $0xffff, %edx - jnz L(neq) - addq $16, %rdi - - cmpq %r10, %rdi - jne L(ATR32res) - - subq %rdi, %r11 - je L(finz) - movq %r11, %r10 - jmp L(small) - /* Align to 16byte to improve instruction fetch. */ - .p2align 4,, 4 -END(memcmp) + movups (VEC_SIZE * 2)(%rsi), %xmm0 + movups (VEC_SIZE * 2)(%rdi), %xmm1 + PCMPEQ %xmm0, %xmm1 + movups (VEC_SIZE * 3)(%rsi), %xmm2 + movups (VEC_SIZE * 3)(%rdi), %xmm3 + PCMPEQ %xmm2, %xmm3 + pand %xmm1, %xmm3 + pmovmskb %xmm3, %eax + CHECK_CMP (%ecx, %eax) + jz L(last_2x_vec) #ifdef USE_AS_MEMCMPEQ -libc_hidden_def (memcmp) +L(ret_nonzero_loop): + ret #else -# undef bcmp -weak_alias (memcmp, bcmp) -libc_hidden_builtin_def (memcmp) + + .p2align 4 +L(ret_nonzero_vec_start_2_3): + pmovmskb %xmm1, %edx + sall $16, %eax + leal 1(%rax, %rdx), %eax + + bsfl %eax, %eax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. 
*/ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret + + .p2align 4 +L(ret_nonzero_loop): + pmovmskb %xmm0, %ecx + pmovmskb %xmm1, %edx + sall $(VEC_SIZE * 1), %edx + leal 1(%rcx, %rdx), %edx + pmovmskb %xmm2, %ecx + /* High 16 bits of eax guaranteed to be all ones. Rotate them in + so we can do `or + not` with just `xor`. */ + rorl $16, %eax + xorl %ecx, %eax + + salq $32, %rax + orq %rdx, %rax + + bsfq %rax, %rax +# ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx + xorl %edx, %edx + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx + /* NB: no partial register stall here because xorl zero idiom + above. */ + setg %dl + leal -1(%rdx, %rdx), %eax +# else + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax +# endif + ret +#endif +END(MEMCMP) + +#ifndef USE_AS_WMEMCMP +# ifdef USE_AS_MEMCMPEQ +libc_hidden_def (MEMCMP) +# else +# undef bcmp +weak_alias (MEMCMP, bcmp) +libc_hidden_builtin_def (MEMCMP) +# endif #endif diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S index 2cee881fed..80c5e912a6 100644 --- a/sysdeps/x86_64/memcmpeq.S +++ b/sysdeps/x86_64/memcmpeq.S @@ -16,6 +16,6 @@ License along with the GNU C Library; if not, see <https://www.gnu.org/licenses/>. */ -#define memcmp __memcmpeq +#define MEMCMP __memcmpeq #define USE_AS_MEMCMPEQ 1 #include "multiarch/memcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index e7ea963fc0..b573966966 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -162,8 +162,8 @@ sysdep_routines += \ wmemchr-sse2 \ wmemcmp-avx2-movbe \ wmemcmp-avx2-movbe-rtm \ - wmemcmp-c \ wmemcmp-evex-movbe \ + wmemcmp-sse2 \ wmemcmp-sse4 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S index e10555638d..4080fc1875 100644 --- a/sysdeps/x86_64/multiarch/memcmp-sse2.S +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S @@ -17,8 +17,8 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# ifndef memcmp -# define memcmp __memcmp_sse2 +# ifndef MEMCMP +# define MEMCMP __memcmp_sse2 # endif # ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S index de7f5a7525..9d991e5c74 100644 --- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S +++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S @@ -17,9 +17,9 @@ <https://www.gnu.org/licenses/>. */ #if IS_IN (libc) -# define memcmp __memcmpeq_sse2 +# define MEMCMP __memcmpeq_sse2 #else -# define memcmp __memcmpeq +# define MEMCMP __memcmpeq #endif #define USE_AS_MEMCMPEQ 1 #include "memcmp-sse2.S" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c deleted file mode 100644 index 46b6715e18..0000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-c.c +++ /dev/null @@ -1,9 +0,0 @@ -#if IS_IN (libc) -# include <wchar.h> - -# define WMEMCMP __wmemcmp_sse2 - -extern __typeof (wmemcmp) __wmemcmp_sse2; -#endif - -#include "wcsmbs/wmemcmp.c" diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S new file mode 100644 index 0000000000..57be1c446e --- /dev/null +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S @@ -0,0 +1,25 @@ +/* wmemcmp optimized with SSE2. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+ + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#if IS_IN (libc) +# define MEMCMP __wmemcmp_sse2 +#else +# define MEMCMP wmemcmp +#endif +#define USE_AS_WMEMCMP 1 +#include "memcmp-sse2.S" diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S new file mode 100644 index 0000000000..032f389158 --- /dev/null +++ b/sysdeps/x86_64/wmemcmp.S @@ -0,0 +1,21 @@ +/* wmemcmp optimized with SSE2. + Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + <https://www.gnu.org/licenses/>. */ + +#define MEMCMP wmemcmp +#define USE_AS_WMEMCMP 1 +#include "multiarch/memcmp-sse2.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
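To see the core idiom the SSE2 memcmp above is built from, here is a hedged C sketch of one 16-byte step (illustrative only: `cmp_one_vec` is not glibc code, and the GCC builtin `__builtin_ctz` stands in for the assembly's `bsf`):

#include <emmintrin.h>  /* SSE2 intrinsics.  */

/* One 16-byte comparison step.  pcmpeqb sets 0xff per equal byte,
   pmovmskb packs those into a 16-bit mask, and the mask equals 0xffff
   exactly when all 16 bytes match -- which is why the assembly can
   test with a single `subl %ecx, %eax` against 0xffff.  */
static int
cmp_one_vec (const unsigned char *s1, const unsigned char *s2)
{
  __m128i v1 = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i v2 = _mm_loadu_si128 ((const __m128i *) s2);
  unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v1, v2));
  if (mask == 0xffff)
    return 0;			/* All 16 bytes equal.  */
  /* Lowest zero bit of mask = first mismatching byte.  */
  unsigned int idx = __builtin_ctz (~mask);
  /* memcmp must compare bytes as unsigned.  */
  return (int) s1[idx] - (int) s2[idx];
}

The assembly reaches the same index with `bsf` on `mask - 0xffff`: the subtraction flips the trailing one bits, so `bsf` lands on the first mismatch just as `__builtin_ctz (~mask)` does here.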
* Re: [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S 2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein @ 2022-04-10 0:48 ` Noah Goldstein 0 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw) To: GNU C Library Disregard this patch. It's from the wrong patchset. On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > New code save size (-303 bytes) and has significantly better > performance. > > geometric_mean(N=20) of page cross cases New / Original: 0.634 > --- > sysdeps/x86_64/memcmp.S | 884 ++++++++++++++--------- > sysdeps/x86_64/memcmpeq.S | 2 +- > sysdeps/x86_64/multiarch/Makefile | 2 +- > sysdeps/x86_64/multiarch/memcmp-sse2.S | 4 +- > sysdeps/x86_64/multiarch/memcmpeq-sse2.S | 4 +- > sysdeps/x86_64/multiarch/wmemcmp-c.c | 9 - > sysdeps/x86_64/multiarch/wmemcmp-sse2.S | 25 + > sysdeps/x86_64/wmemcmp.S | 21 + > 8 files changed, 575 insertions(+), 376 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c > create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S > create mode 100644 sysdeps/x86_64/wmemcmp.S > > diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S > index e02a53ea1e..b153694048 100644 > --- a/sysdeps/x86_64/memcmp.S > +++ b/sysdeps/x86_64/memcmp.S > @@ -18,395 +18,557 @@ > > #include <sysdep.h> > > - .text > -ENTRY (memcmp) > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - movl %edx, %edx > +#ifdef USE_AS_WMEMCMP > +# define PCMPEQ pcmpeqd > +# define CHAR_SIZE 4 > +# define SIZE_OFFSET (0) > +#else > +# define PCMPEQ pcmpeqb > +# define CHAR_SIZE 1 > #endif > - test %RDX_LP, %RDX_LP > - jz L(finz) > - cmpq $1, %rdx > - jbe L(finr1b) > - subq %rdi, %rsi > - movq %rdx, %r10 > - cmpq $32, %r10 > - jae L(gt32) > - /* Handle small chunks and last block of less than 32 bytes. */ > -L(small): > - testq $1, %r10 > - jz L(s2b) > - movzbl (%rdi), %eax > - movzbl (%rdi, %rsi), %edx > - subq $1, %r10 > - je L(finz1) > - addq $1, %rdi > - subl %edx, %eax > - jnz L(exit) > -L(s2b): > - testq $2, %r10 > - jz L(s4b) > - movzwl (%rdi), %eax > - movzwl (%rdi, %rsi), %edx > - subq $2, %r10 > + > #ifdef USE_AS_MEMCMPEQ > - je L(finz1) > +# define SIZE_OFFSET (0) > +# define CHECK_CMP(x, y) subl x, y > #else > - je L(fin2_7) > +# ifndef SIZE_OFFSET > +# define SIZE_OFFSET (CHAR_PER_VEC * 2) > +# endif > +# define CHECK_CMP(x, y) cmpl x, y > #endif > - addq $2, %rdi > - cmpl %edx, %eax > -#ifdef USE_AS_MEMCMPEQ > - jnz L(neq_early) > + > +#define VEC_SIZE 16 > +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) > + > +#ifndef MEMCMP > +# define MEMCMP memcmp > +#endif > + > + .text > +ENTRY(MEMCMP) > +#ifdef USE_AS_WMEMCMP > + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store > + in ecx for code size. This is preferable to using `incw` as > + it avoids partial register stalls on older hardware (pre > + SnB). */ > + movl $0xffff, %ecx > +#endif > + cmpq $CHAR_PER_VEC, %rdx > + ja L(more_1x_vec) > + > +#ifdef USE_AS_WMEMCMP > + /* saves a byte of code keeping the fall through path n = [2, 4] > + in the initial cache line. 
*/ > + decl %edx > + jle L(cmp_0_1) > + > + movq (%rsi), %xmm0 > + movq (%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + subl %ecx, %eax > + jnz L(ret_nonzero_vec_start_0) > + > + movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0 > + movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + subl %ecx, %eax > + jnz L(ret_nonzero_vec_end_0_adj) > #else > - jnz L(fin2_7) > + cmpl $8, %edx > + ja L(cmp_9_16) > + > + cmpl $4, %edx > + jb L(cmp_0_3) > + > +# ifdef USE_AS_MEMCMPEQ > + movl (%rsi), %eax > + subl (%rdi), %eax > + > + movl -4(%rsi, %rdx), %esi > + subl -4(%rdi, %rdx), %esi > + > + orl %esi, %eax > + ret > +# else > + /* Combine comparisons for lo and hi 4-byte comparisons. */ > + movl -4(%rsi, %rdx), %ecx > + movl -4(%rdi, %rdx), %eax > + shlq $32, %rcx > + shlq $32, %rax > + movl (%rsi), %esi > + movl (%rdi), %edi > + orq %rsi, %rcx > + orq %rdi, %rax > + /* Only compute proper return if not-equal. */ > + cmpq %rcx, %rax > + jnz L(ret_nonzero) > + xorl %eax, %eax > + ret > +# endif > + > + .p2align 4,, 10 > +L(cmp_9_16): > +# ifdef USE_AS_MEMCMPEQ > + movq (%rsi), %rax > + subq (%rdi), %rax > + > + movq -8(%rsi, %rdx), %rcx > + subq -8(%rdi, %rdx), %rcx > + orq %rcx, %rax > + /* Convert 64 bit -> 32 bit boolean (we should have made the ABI > + return long). */ > + setnz %cl > + movzbl %cl, %eax > +# else > + movq (%rsi), %rcx > + movq (%rdi), %rax > + /* Only compute proper return if not-equal. */ > + cmpq %rcx, %rax > + jnz L(ret_nonzero) > + > + movq -8(%rsi, %rdx, CHAR_SIZE), %rcx > + movq -8(%rdi, %rdx, CHAR_SIZE), %rax > + /* Only compute proper return if not-equal. */ > + cmpq %rcx, %rax > + jnz L(ret_nonzero) > + xorl %eax, %eax > +# endif > #endif > -L(s4b): > - testq $4, %r10 > - jz L(s8b) > - movl (%rdi), %eax > - movl (%rdi, %rsi), %edx > - subq $4, %r10 > -#ifdef USE_AS_MEMCMPEQ > - je L(finz1) > + ret > + > + .p2align 4,, 8 > +L(cmp_0_1): > + /* Flag set by earlier comparison against 1. */ > + jne L(cmp_0_0) > +#ifdef USE_AS_WMEMCMP > + movl (%rdi), %ecx > + xorl %edx, %edx > + cmpl (%rsi), %ecx > + je L(cmp_0_0) > + setg %dl > + leal -1(%rdx, %rdx), %eax > #else > - je L(fin2_7) > + movzbl (%rdi), %eax > + movzbl (%rsi), %ecx > + subl %ecx, %eax > #endif > - addq $4, %rdi > - cmpl %edx, %eax > -#ifdef USE_AS_MEMCMPEQ > - jnz L(neq_early) > + ret > + > + /* Fits in aligning bytes. */ > +L(cmp_0_0): > + xorl %eax, %eax > + ret > + > +#ifdef USE_AS_WMEMCMP > + .p2align 4 > +L(ret_nonzero_vec_start_0): > + bsfl %eax, %eax > + movl (%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > + ret > +#else > + > +# ifndef USE_AS_MEMCMPEQ > + .p2align 4,, 14 > +L(ret_nonzero): > + /* Need to bswap to get proper return without branch. */ > + bswapq %rcx > + bswapq %rax > + subq %rcx, %rax > + sbbl %eax, %eax > + orl $1, %eax > + ret > +# endif > + > + .p2align 4 > +L(cmp_0_3): > +# ifdef USE_AS_MEMCMPEQ > + /* No reason to add to dependency chain on rdx. Saving a the > + bytes here doesn't change number of fetch blocks. */ > + cmpl $1, %edx > + jbe L(cmp_0_1) > +# else > + /* We need the code size to prevent taking an extra fetch block. 
> + */ > + decl %edx > + jle L(cmp_0_1) > +# endif > + movzwl (%rsi), %ecx > + movzwl (%rdi), %eax > + > +# ifdef USE_AS_MEMCMPEQ > + subl %ecx, %eax > + > + movzbl -1(%rsi, %rdx), %esi > + movzbl -1(%rdi, %rdx), %edi > + subl %edi, %esi > + orl %esi, %eax > +# else > + bswapl %ecx > + bswapl %eax > + > + /* Implicit right shift by one. We just need to displace the > + sign bits. */ > + shrl %ecx > + shrl %eax > + > + /* Eat a partial register stall here. Saves code stopping > + L(cmp_0_3) from bleeding into the next fetch block and saves > + an ALU. */ > + movb (%rsi, %rdx), %cl > + movzbl (%rdi, %rdx), %edi > + orl %edi, %eax > + subl %ecx, %eax > +# endif > + ret > +#endif > + > + .p2align 5 > +L(more_1x_vec): > +#ifndef USE_AS_WMEMCMP > + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store > + in ecx for code size. This is preferable to using `incw` as > + it avoids partial register stalls on older hardware (pre > + SnB). */ > + movl $0xffff, %ecx > +#endif > + movups (%rsi), %xmm0 > + movups (%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + subl %ecx, %eax > + jnz L(ret_nonzero_vec_start_0) > +#if SIZE_OFFSET == 0 > + cmpq $(CHAR_PER_VEC * 2), %rdx > #else > - jnz L(fin2_7) > + /* Offset rdx. Saves just enough code size to keep the > + L(last_2x_vec) case and the non-zero return in a single > + cache line. */ > + subq $(CHAR_PER_VEC * 2), %rdx > #endif > -L(s8b): > - testq $8, %r10 > - jz L(s16b) > - movq (%rdi), %rax > - movq (%rdi, %rsi), %rdx > - subq $8, %r10 > -#ifdef USE_AS_MEMCMPEQ > - je L(sub_return8) > + ja L(more_2x_vec) > + > + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 > + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + subl %ecx, %eax > +#ifndef USE_AS_MEMCMPEQ > + /* Don't use `incw ax` as machines this code runs on are liable > + to have partial register stall. */ > + jnz L(ret_nonzero_vec_end_0) > #else > - je L(fin2_7) > + /* Various return targets for memcmpeq. Will always be hot in > + Icache and get short encoding. */ > +L(ret_nonzero_vec_start_1): > +L(ret_nonzero_vec_start_0): > +L(ret_nonzero_vec_end_0): > #endif > - addq $8, %rdi > - cmpq %rdx, %rax > -#ifdef USE_AS_MEMCMPEQ > - jnz L(neq_early) > + ret > + > +#ifndef USE_AS_MEMCMPEQ > +# ifdef USE_AS_WMEMCMP > + .p2align 4 > +L(ret_nonzero_vec_end_0_adj): > + addl $3, %edx > +# else > + .p2align 4,, 8 > +# endif > +L(ret_nonzero_vec_end_0): > + bsfl %eax, %eax > +# ifdef USE_AS_WMEMCMP > + leal (%rax, %rdx, CHAR_SIZE), %eax > + movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. 
*/ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + addl %edx, %eax > + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > + ret > +# ifndef USE_AS_WMEMCMP > + .p2align 4,, 10 > +L(ret_nonzero_vec_start_0): > + bsfl %eax, %eax > + movzbl (%rsi, %rax), %ecx > + movzbl (%rdi, %rax), %eax > + subl %ecx, %eax > + ret > +# endif > #else > - jnz L(fin2_7) > #endif > -L(s16b): > - movdqu (%rdi), %xmm1 > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb %xmm0, %xmm1 > + > + .p2align 5 > +L(more_2x_vec): > + movups (VEC_SIZE * 1)(%rsi), %xmm0 > + movups (VEC_SIZE * 1)(%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + pmovmskb %xmm1, %eax > + subl %ecx, %eax > + jnz L(ret_nonzero_vec_start_1) > + > + cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx > + jbe L(last_2x_vec) > + > + cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx > + ja L(more_8x_vec) > + > + /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time. > + This can harm performance if non-zero return in [65, 80] or > + [97, 112] but helps performance otherwise. Generally zero- > + return is hotter. */ > + movups (VEC_SIZE * 2)(%rsi), %xmm0 > + movups (VEC_SIZE * 2)(%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + movups (VEC_SIZE * 3)(%rsi), %xmm2 > + movups (VEC_SIZE * 3)(%rdi), %xmm3 > + PCMPEQ %xmm2, %xmm3 > + pand %xmm1, %xmm3 > + > + pmovmskb %xmm3, %eax > + CHECK_CMP (%ecx, %eax) > + jnz L(ret_nonzero_vec_start_2_3) > + > + cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx > + jbe L(last_2x_vec) > + > + movups (VEC_SIZE * 4)(%rsi), %xmm0 > + movups (VEC_SIZE * 4)(%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + movups (VEC_SIZE * 5)(%rsi), %xmm2 > + movups (VEC_SIZE * 5)(%rdi), %xmm3 > + PCMPEQ %xmm2, %xmm3 > + pand %xmm1, %xmm3 > + > + pmovmskb %xmm3, %eax > + CHECK_CMP (%ecx, %eax) > #ifdef USE_AS_MEMCMPEQ > - pmovmskb %xmm1, %eax > - subl $0xffff, %eax > + jz L(last_2x_vec) > ret > #else > - pmovmskb %xmm1, %edx > - xorl %eax, %eax > - subl $0xffff, %edx > - jz L(finz) > - bsfl %edx, %ecx > - leaq (%rdi, %rcx), %rcx > - movzbl (%rcx), %eax > - movzbl (%rsi, %rcx), %edx > - jmp L(finz1) > + jnz L(ret_nonzero_vec_start_4_5) > #endif > - .p2align 4,, 4 > -L(finr1b): > - movzbl (%rdi), %eax > - movzbl (%rsi), %edx > -L(finz1): > - subl %edx, %eax > -L(exit): > - ret > + .p2align 4 > +L(last_2x_vec): > + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0 > + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2 > + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3 > + PCMPEQ %xmm2, %xmm3 > + pand %xmm1, %xmm3 > + pmovmskb %xmm3, %eax > + subl %ecx, %eax > #ifdef USE_AS_MEMCMPEQ > - .p2align 4,, 4 > -L(sub_return8): > - subq %rdx, %rax > - movl %eax, %edx > - shrq $32, %rax > - orl %edx, %eax > + /* Various return targets for memcmpeq. Will always be hot in > + Icache and get short encoding. 
*/ > +L(ret_nonzero_vec_start_2_3): > +L(ret_nonzero_vec_start_4_5): > ret > #else > - .p2align 4,, 4 > -L(fin2_7): > - cmpq %rdx, %rax > - jz L(finz) > - movq %rax, %r11 > - subq %rdx, %r11 > - bsfq %r11, %rcx > - sarq $3, %rcx > - salq $3, %rcx > - sarq %cl, %rax > - movzbl %al, %eax > - sarq %cl, %rdx > - movzbl %dl, %edx > - subl %edx, %eax > + jnz L(ret_nonzero_vec_end_1) > ret > -#endif > - .p2align 4,, 4 > -L(finz): > - xorl %eax, %eax > + > + .p2align 4,, 8 > +L(ret_nonzero_vec_end_1): > + pmovmskb %xmm1, %ecx > + /* High 16 bits of eax guranteed to be all ones. Rotate them in > + to we can do `or + not` with just `xor`. */ > + rorl $16, %eax > + xorl %ecx, %eax > + /* Partial register stall. */ > + > + bsfl %eax, %eax > +# ifdef USE_AS_WMEMCMP > + leal (%rax, %rdx, CHAR_SIZE), %eax > + movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + addl %edx, %eax > + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > ret > -#ifdef USE_AS_MEMCMPEQ > - .p2align 4,, 4 > -L(neq_early): > - movl $1, %eax > + > + .p2align 4 > +L(ret_nonzero_vec_start_4_5): > + pmovmskb %xmm1, %edx > + sall $16, %eax > + leal 1(%rax, %rdx), %eax > + bsfl %eax, %eax > +# ifdef USE_AS_WMEMCMP > + movl (VEC_SIZE * 4)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > + ret > + > + .p2align 4,, 8 > +L(ret_nonzero_vec_start_1): > + bsfl %eax, %eax > +# ifdef USE_AS_WMEMCMP > + movl (VEC_SIZE * 1)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > ret > #endif > - /* For blocks bigger than 32 bytes > - 1. Advance one of the addr pointer to be 16B aligned. > - 2. Treat the case of both addr pointers aligned to 16B > - separately to avoid movdqu. > - 3. Handle any blocks of greater than 64 consecutive bytes with > - unrolling to reduce branches. > - 4. At least one addr pointer is 16B aligned, use memory version > - of pcmbeqb. > - */ > - .p2align 4,, 4 > -L(gt32): > - movq %rdx, %r11 > - addq %rdi, %r11 > - movq %rdi, %r8 > - > - andq $15, %r8 > - jz L(16am) > - /* Both pointers may be misaligned. */ > - movdqu (%rdi), %xmm1 > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb %xmm0, %xmm1 > - pmovmskb %xmm1, %edx > - subl $0xffff, %edx > - jnz L(neq) > - neg %r8 > - leaq 16(%rdi, %r8), %rdi > -L(16am): > - /* Handle two 16B aligned pointers separately. */ > - testq $15, %rsi > - jz L(ATR) > - testq $16, %rdi > - jz L(A32) > - movdqu (%rdi, %rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > -L(A32): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jae L(mt16) > - /* Pre-unroll to be ready for unrolled 64B loop. 
*/ > - testq $32, %rdi > - jz L(A64) > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > -L(A64): > - movq %r11, %r10 > - andq $-64, %r10 > - cmpq %r10, %rdi > - jae L(mt32) > - > -L(A64main): > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %rdi, %r10 > - jne L(A64main) > - > -L(mt32): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jae L(mt16) > - > -L(A32main): > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqu (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %rdi, %r10 > - jne L(A32main) > -L(mt16): > - subq %rdi, %r11 > - je L(finz) > - movq %r11, %r10 > - jmp L(small) > - > - .p2align 4,, 4 > -L(neq): > -#ifdef USE_AS_MEMCMPEQ > - movl $1, %eax > - ret > -#else > - bsfl %edx, %ecx > - movzbl (%rdi, %rcx), %eax > - addq %rdi, %rsi > - movzbl (%rsi,%rcx), %edx > - jmp L(finz1) > + > + .p2align 4 > +L(more_8x_vec): > + subq %rdi, %rsi > + leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx > + andq $(VEC_SIZE * -1), %rdi > + addq %rdi, %rsi > + .p2align 4 > +L(loop_4x): > + movups (VEC_SIZE * 2)(%rsi), %xmm0 > + movups (VEC_SIZE * 3)(%rsi), %xmm1 > + > + PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0 > + PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1 > + > + movups (VEC_SIZE * 4)(%rsi), %xmm2 > + movups (VEC_SIZE * 5)(%rsi), %xmm3 > + > + PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2 > + PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3 > + > + pand %xmm0, %xmm1 > + pand %xmm2, %xmm3 > + pand %xmm1, %xmm3 > + > + pmovmskb %xmm3, %eax > + subl %ecx, %eax > + jnz L(ret_nonzero_loop) > + > + addq $(VEC_SIZE * 4), %rdi > + addq $(VEC_SIZE * 4), %rsi > + cmpq %rdi, %rdx > + ja L(loop_4x) > + /* Get remaining length in edx. */ > + subl %edi, %edx > + /* Restore offset so we can reuse L(last_2x_vec). 
*/ > + addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx > +#ifdef USE_AS_WMEMCMP > + shrl $2, %edx > #endif > + cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx > + jbe L(last_2x_vec) > + > > - .p2align 4,, 4 > -L(ATR): > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jae L(mt16) > - testq $16, %rdi > - jz L(ATR32) > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - cmpq %rdi, %r10 > - je L(mt16) > - > -L(ATR32): > - movq %r11, %r10 > - andq $-64, %r10 > - testq $32, %rdi > - jz L(ATR64) > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > -L(ATR64): > - cmpq %rdi, %r10 > - je L(mt32) > - > -L(ATR64main): > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - cmpq %rdi, %r10 > - jne L(ATR64main) > - > - movq %r11, %r10 > - andq $-32, %r10 > - cmpq %r10, %rdi > - jae L(mt16) > - > -L(ATR32res): > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - movdqa (%rdi,%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - pmovmskb %xmm0, %edx > - subl $0xffff, %edx > - jnz L(neq) > - addq $16, %rdi > - > - cmpq %r10, %rdi > - jne L(ATR32res) > - > - subq %rdi, %r11 > - je L(finz) > - movq %r11, %r10 > - jmp L(small) > - /* Align to 16byte to improve instruction fetch. */ > - .p2align 4,, 4 > -END(memcmp) > + movups (VEC_SIZE * 2)(%rsi), %xmm0 > + movups (VEC_SIZE * 2)(%rdi), %xmm1 > + PCMPEQ %xmm0, %xmm1 > + movups (VEC_SIZE * 3)(%rsi), %xmm2 > + movups (VEC_SIZE * 3)(%rdi), %xmm3 > + PCMPEQ %xmm2, %xmm3 > + pand %xmm1, %xmm3 > > + pmovmskb %xmm3, %eax > + CHECK_CMP (%ecx, %eax) > + jz L(last_2x_vec) > #ifdef USE_AS_MEMCMPEQ > -libc_hidden_def (memcmp) > +L(ret_nonzero_loop): > + ret > #else > -# undef bcmp > -weak_alias (memcmp, bcmp) > -libc_hidden_builtin_def (memcmp) > + > + .p2align 4 > +L(ret_nonzero_vec_start_2_3): > + pmovmskb %xmm1, %edx > + sall $16, %eax > + leal 1(%rax, %rdx), %eax > + > + bsfl %eax, %eax > +# ifdef USE_AS_WMEMCMP > + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > + ret > + > + .p2align 4 > +L(ret_nonzero_loop): > + pmovmskb %xmm0, %ecx > + pmovmskb %xmm1, %edx > + sall $(VEC_SIZE * 1), %edx > + leal 1(%rcx, %rdx), %edx > + pmovmskb %xmm2, %ecx > + /* High 16 bits of eax guranteed to be all ones. Rotate them in > + to we can do `or + not` with just `xor`. 
*/ > + rorl $16, %eax > + xorl %ecx, %eax > + > + salq $32, %rax > + orq %rdx, %rax > + > + bsfq %rax, %rax > +# ifdef USE_AS_WMEMCMP > + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx > + xorl %edx, %edx > + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx > + /* NB: no partial register stall here because xorl zero idiom > + above. */ > + setg %dl > + leal -1(%rdx, %rdx), %eax > +# else > + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx > + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax > + subl %ecx, %eax > +# endif > + ret > +#endif > +END(MEMCMP) > + > +#ifndef USE_AS_WMEMCMP > +# ifdef USE_AS_MEMCMPEQ > +libc_hidden_def (MEMCMP) > +# else > +# undef bcmp > +weak_alias (MEMCMP, bcmp) > +libc_hidden_builtin_def (MEMCMP) > +# endif > #endif > diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S > index 2cee881fed..80c5e912a6 100644 > --- a/sysdeps/x86_64/memcmpeq.S > +++ b/sysdeps/x86_64/memcmpeq.S > @@ -16,6 +16,6 @@ > License along with the GNU C Library; if not, see > <https://www.gnu.org/licenses/>. */ > > -#define memcmp __memcmpeq > +#define MEMCMP __memcmpeq > #define USE_AS_MEMCMPEQ 1 > #include "multiarch/memcmp-sse2.S" > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index e7ea963fc0..b573966966 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -162,8 +162,8 @@ sysdep_routines += \ > wmemchr-sse2 \ > wmemcmp-avx2-movbe \ > wmemcmp-avx2-movbe-rtm \ > - wmemcmp-c \ > wmemcmp-evex-movbe \ > + wmemcmp-sse2 \ > wmemcmp-sse4 \ > # sysdep_routines > endif > diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S > index e10555638d..4080fc1875 100644 > --- a/sysdeps/x86_64/multiarch/memcmp-sse2.S > +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S > @@ -17,8 +17,8 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > -# ifndef memcmp > -# define memcmp __memcmp_sse2 > +# ifndef MEMCMP > +# define MEMCMP __memcmp_sse2 > # endif > > # ifdef SHARED > diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S > index de7f5a7525..9d991e5c74 100644 > --- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S > +++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S > @@ -17,9 +17,9 @@ > <https://www.gnu.org/licenses/>. */ > > #if IS_IN (libc) > -# define memcmp __memcmpeq_sse2 > +# define MEMCMP __memcmpeq_sse2 > #else > -# define memcmp __memcmpeq > +# define MEMCMP __memcmpeq > #endif > #define USE_AS_MEMCMPEQ 1 > #include "memcmp-sse2.S" > diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c > deleted file mode 100644 > index 46b6715e18..0000000000 > --- a/sysdeps/x86_64/multiarch/wmemcmp-c.c > +++ /dev/null > @@ -1,9 +0,0 @@ > -#if IS_IN (libc) > -# include <wchar.h> > - > -# define WMEMCMP __wmemcmp_sse2 > - > -extern __typeof (wmemcmp) __wmemcmp_sse2; > -#endif > - > -#include "wcsmbs/wmemcmp.c" > diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S > new file mode 100644 > index 0000000000..57be1c446e > --- /dev/null > +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S > @@ -0,0 +1,25 @@ > +/* wmemcmp optimized with SSE2. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. 
> + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#if IS_IN (libc) > +# define MEMCMP __wmemcmp_sse2 > +#else > +# define MEMCMP wmemcmp > +#endif > +#define USE_AS_WMEMCMP 1 > +#include "memcmp-sse2.S" > diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S > new file mode 100644 > index 0000000000..032f389158 > --- /dev/null > +++ b/sysdeps/x86_64/wmemcmp.S > @@ -0,0 +1,21 @@ > +/* wmemcmp optimized with SSE2. > + Copyright (C) 2022 Free Software Foundation, Inc. > + This file is part of the GNU C Library. > + > + The GNU C Library is free software; you can redistribute it and/or > + modify it under the terms of the GNU Lesser General Public > + License as published by the Free Software Foundation; either > + version 2.1 of the License, or (at your option) any later version. > + > + The GNU C Library is distributed in the hope that it will be useful, > + but WITHOUT ANY WARRANTY; without even the implied warranty of > + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > + Lesser General Public License for more details. > + > + You should have received a copy of the GNU Lesser General Public > + License along with the GNU C Library; if not, see > + <https://www.gnu.org/licenses/>. */ > + > +#define MEMCMP wmemcmp > +#define USE_AS_WMEMCMP 1 > +#include "multiarch/memcmp-sse2.S" > -- > 2.25.1 > ^ permalink raw reply [flat|nested] 49+ messages in thread
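The short-size paths in the patch quoted above rely on overlapping loads plus byte swaps so that unsigned integer order matches memcmp's lexicographic order. A hedged C sketch of the [4, 8]-byte case (cmp_4_to_8 and load32 are illustrative helpers, and the GCC-style __builtin_bswap64 builtin is assumed):

    #include <stdint.h>
    #include <string.h>

    static inline uint32_t
    load32 (const void *p)
    {
      uint32_t v;
      memcpy (&v, p, sizeof v);	/* Unaligned-safe load.  */
      return v;
    }

    /* Valid for 4 <= n <= 8; the two loads overlap when n < 8, like the
       movl (%rsi) / movl -4(%rsi, %rdx) pair in the quoted diff.  */
    static int
    cmp_4_to_8 (const unsigned char *s1, const unsigned char *s2, size_t n)
    {
      uint64_t a = ((uint64_t) load32 (s1 + n - 4) << 32) | load32 (s1);
      uint64_t b = ((uint64_t) load32 (s2 + n - 4) << 32) | load32 (s2);
      if (a == b)
        return 0;
      /* On little-endian x86, byte-swapping makes the numerically
         greater value the lexicographically greater buffer.  */
      a = __builtin_bswap64 (a);
      b = __builtin_bswap64 (b);
      return a > b ? 1 : -1;
    }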
* [PATCH v3 5/6] x86: Remove memcmp-sse4.S
  2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
  ` (5 preceding siblings ...)
  2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
  2022-04-10 0:48 ` Noah Goldstein
  2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
  ` (2 subsequent siblings)
  9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha

The code didn't actually use any SSE4 instructions. The new memcmp-sse2
implementation is also faster.

geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905

Note there are two regressions preferring SSE2 for Size = 1 and Size = 65.

Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2

This is intentional. Size == 1 is significantly less hot based on
profiles of GCC11 and Python3 than sizes [4, 8] (which are made hotter).

Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%

GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%

size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625

Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215

This is because A) the checks in range [65, 96] are now unrolled 2x
and B) smaller values <= 16 are now given a hotter path. By contrast,
the SSE4 version has a branch for Size = 80. The unrolled version gets
better performance for returns which need both comparisons.

size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888

As well, outside of microbenchmark environments, where branches are
not fully predictable, the branch will have a real cost. 
--- sysdeps/x86_64/multiarch/Makefile | 2 -- sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ---- sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 ---- 3 files changed, 10 deletions(-) diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index b573966966..0400ea332b 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -11,7 +11,6 @@ sysdep_routines += \ memcmp-avx2-movbe-rtm \ memcmp-evex-movbe \ memcmp-sse2 \ - memcmp-sse4 \ memcmpeq-avx2 \ memcmpeq-avx2-rtm \ memcmpeq-evex \ @@ -164,7 +163,6 @@ sysdep_routines += \ wmemcmp-avx2-movbe-rtm \ wmemcmp-evex-movbe \ wmemcmp-sse2 \ - wmemcmp-sse4 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index c6008a73ed..a8afcf81bb 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __memcmp_evex_movbe) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), - __memcmp_sse4_1) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) #ifdef SHARED @@ -809,8 +807,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, && CPU_FEATURE_USABLE (BMI2) && CPU_FEATURE_USABLE (MOVBE)), __wmemcmp_evex_movbe) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), - __wmemcmp_sse4_1) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) /* Support sysdeps/x86_64/multiarch/wmemset.c. */ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index 44759a3ad5..c743970fe3 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -20,7 +20,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; @@ -46,8 +45,5 @@ IFUNC_SELECTOR (void) return OPTIMIZE (avx2_movbe); } - if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) - return OPTIMIZE (sse4_1); - return OPTIMIZE (sse2); } -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
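The geometric_mean(N=20) figures quoted in these commit messages summarize per-case New/Old timing ratios into one number. A small self-contained C sketch of that computation (the ratios in main are placeholders, not measured data; link with -lm):

    #include <math.h>
    #include <stddef.h>
    #include <stdio.h>

    static double
    geometric_mean (const double *ratios, size_t n)
    {
      double log_sum = 0.0;
      for (size_t i = 0; i < n; i++)
        log_sum += log (ratios[i]);	/* Summing logs avoids overflow.  */
      return exp (log_sum / (double) n);
    }

    int
    main (void)
    {
      /* Hypothetical New/Old ratios for a few benchmark cases.  */
      double r[] = { 0.87, 0.91, 0.95, 0.89 };
      printf ("geometric_mean: %.3f\n",
              geometric_mean (r, sizeof r / sizeof r[0]));
      return 0;
    }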
* Re: [PATCH v3 5/6] x86: Remove memcmp-sse4.S
  2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
  0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library

Disregard this patch. It's from the wrong patchset.

On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> [snip]

^ permalink raw reply [flat|nested] 49+ messages in thread
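The 2x unrolling the commit message above credits for the [65, 96] range shows up in the diffs as pairs of pcmpeqb results combined with pand, so a single pmovmskb and branch cover 32 bytes. A rough SSE2 intrinsics equivalent (blocks32_equal is an illustrative name, not a glibc function):

    #include <emmintrin.h>

    /* Nonzero if the two 32-byte blocks are equal.  Requires SSE2,
       the x86-64 baseline.  */
    static int
    blocks32_equal (const void *p1, const void *p2)
    {
      __m128i a0 = _mm_loadu_si128 ((const __m128i *) p1);
      __m128i b0 = _mm_loadu_si128 ((const __m128i *) p2);
      __m128i a1 = _mm_loadu_si128 ((const __m128i *) p1 + 1);
      __m128i b1 = _mm_loadu_si128 ((const __m128i *) p2 + 1);
      /* pcmpeqb + pcmpeqb + pand: any zero byte marks a mismatch.  */
      __m128i eq = _mm_and_si128 (_mm_cmpeq_epi8 (a0, b0),
                                  _mm_cmpeq_epi8 (a1, b1));
      /* pmovmskb: all 16 mask bits set <=> all 32 bytes equal.  */
      return _mm_movemask_epi8 (eq) == 0xffff;
    }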
* [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (6 preceding siblings ...) 2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein @ 2022-04-10 0:42 ` Noah Goldstein 2022-04-10 0:48 ` Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 9 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw) To: libc-alpha Old code was both inefficient and wasted code size. New code (-62 bytes) and comparable or better performance in the page cross case. geometric_mean(N=20) of page cross cases New / Original: 0.960 size, align0, align1, ret, New Time/Old Time 1, 4095, 0, 0, 1.001 1, 4095, 0, 1, 0.999 1, 4095, 0, -1, 1.0 2, 4094, 0, 0, 1.0 2, 4094, 0, 1, 1.0 2, 4094, 0, -1, 1.0 3, 4093, 0, 0, 1.0 3, 4093, 0, 1, 1.0 3, 4093, 0, -1, 1.0 4, 4092, 0, 0, 0.987 4, 4092, 0, 1, 1.0 4, 4092, 0, -1, 1.0 5, 4091, 0, 0, 0.984 5, 4091, 0, 1, 1.002 5, 4091, 0, -1, 1.005 6, 4090, 0, 0, 0.993 6, 4090, 0, 1, 1.001 6, 4090, 0, -1, 1.003 7, 4089, 0, 0, 0.991 7, 4089, 0, 1, 1.0 7, 4089, 0, -1, 1.001 8, 4088, 0, 0, 0.875 8, 4088, 0, 1, 0.881 8, 4088, 0, -1, 0.888 9, 4087, 0, 0, 0.872 9, 4087, 0, 1, 0.879 9, 4087, 0, -1, 0.883 10, 4086, 0, 0, 0.878 10, 4086, 0, 1, 0.886 10, 4086, 0, -1, 0.873 11, 4085, 0, 0, 0.878 11, 4085, 0, 1, 0.881 11, 4085, 0, -1, 0.879 12, 4084, 0, 0, 0.873 12, 4084, 0, 1, 0.889 12, 4084, 0, -1, 0.875 13, 4083, 0, 0, 0.873 13, 4083, 0, 1, 0.863 13, 4083, 0, -1, 0.863 14, 4082, 0, 0, 0.838 14, 4082, 0, 1, 0.869 14, 4082, 0, -1, 0.877 15, 4081, 0, 0, 0.841 15, 4081, 0, 1, 0.869 15, 4081, 0, -1, 0.876 16, 4080, 0, 0, 0.988 16, 4080, 0, 1, 0.99 16, 4080, 0, -1, 0.989 17, 4079, 0, 0, 0.978 17, 4079, 0, 1, 0.981 17, 4079, 0, -1, 0.98 18, 4078, 0, 0, 0.981 18, 4078, 0, 1, 0.98 18, 4078, 0, -1, 0.985 19, 4077, 0, 0, 0.977 19, 4077, 0, 1, 0.979 19, 4077, 0, -1, 0.986 20, 4076, 0, 0, 0.977 20, 4076, 0, 1, 0.986 20, 4076, 0, -1, 0.984 21, 4075, 0, 0, 0.977 21, 4075, 0, 1, 0.983 21, 4075, 0, -1, 0.988 22, 4074, 0, 0, 0.983 22, 4074, 0, 1, 0.994 22, 4074, 0, -1, 0.993 23, 4073, 0, 0, 0.98 23, 4073, 0, 1, 0.992 23, 4073, 0, -1, 0.995 24, 4072, 0, 0, 0.989 24, 4072, 0, 1, 0.989 24, 4072, 0, -1, 0.991 25, 4071, 0, 0, 0.99 25, 4071, 0, 1, 0.999 25, 4071, 0, -1, 0.996 26, 4070, 0, 0, 0.993 26, 4070, 0, 1, 0.995 26, 4070, 0, -1, 0.998 27, 4069, 0, 0, 0.993 27, 4069, 0, 1, 0.999 27, 4069, 0, -1, 1.0 28, 4068, 0, 0, 0.997 28, 4068, 0, 1, 1.0 28, 4068, 0, -1, 0.999 29, 4067, 0, 0, 0.996 29, 4067, 0, 1, 0.999 29, 4067, 0, -1, 0.999 30, 4066, 0, 0, 0.991 30, 4066, 0, 1, 1.001 30, 4066, 0, -1, 0.999 31, 4065, 0, 0, 0.988 31, 4065, 0, 1, 0.998 31, 4065, 0, -1, 0.998 --- sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- 1 file changed, 61 insertions(+), 37 deletions(-) diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S index a34ea1645d..210c9925b6 100644 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S @@ -429,22 +429,21 @@ L(page_cross_less_vec): # ifndef USE_AS_WMEMCMP cmpl $8, %edx jae L(between_8_15) + /* Fall through for [4, 7]. */ cmpl $4, %edx - jae L(between_4_7) + jb L(between_2_3) - /* Load as big endian to avoid branches. 
*/ - movzwl (%rdi), %eax - movzwl (%rsi), %ecx - shll $8, %eax - shll $8, %ecx - bswap %eax - bswap %ecx - movzbl -1(%rdi, %rdx), %edi - movzbl -1(%rsi, %rdx), %esi - orl %edi, %eax - orl %esi, %ecx - /* Subtraction is okay because the upper 8 bits are zero. */ - subl %ecx, %eax + movbe (%rdi), %eax + movbe (%rsi), %ecx + shlq $32, %rax + shlq $32, %rcx + movbe -4(%rdi, %rdx), %edi + movbe -4(%rsi, %rdx), %esi + orq %rdi, %rax + orq %rsi, %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) /* No ymm register was touched. */ ret @@ -457,9 +456,33 @@ L(one_or_less): /* No ymm register was touched. */ ret + .p2align 4,, 5 +L(ret_nonzero): + sbbl %eax, %eax + orl $1, %eax + /* No ymm register was touched. */ + ret + + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + /* No ymm register was touched. */ + ret + .p2align 4 L(between_8_15): -# endif + movbe (%rdi), %rax + movbe (%rsi), %rcx + subq %rcx, %rax + jnz L(ret_nonzero) + movbe -8(%rdi, %rdx), %rax + movbe -8(%rsi, %rdx), %rcx + subq %rcx, %rax + /* Fast path for return zero. */ + jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret +# else /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ vmovq (%rdi), %xmm1 vmovq (%rsi), %xmm2 @@ -475,16 +498,13 @@ L(between_8_15): VPCMPEQ %xmm1, %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret +# endif - .p2align 4 -L(zero): - xorl %eax, %eax - ret - - .p2align 4 + .p2align 4,, 10 L(between_16_31): /* From 16 to 31 bytes. No branch when size == 16. */ vmovdqu (%rsi), %xmm2 @@ -501,11 +521,17 @@ L(between_16_31): VPCMPEQ (%rdi), %xmm2, %xmm2 vpmovmskb %xmm2, %eax subl $0xffff, %eax + /* Fast path for return zero. */ jnz L(return_vec_0) /* No ymm register was touched. */ ret # ifdef USE_AS_WMEMCMP + .p2align 4,, 2 +L(zero): + xorl %eax, %eax + ret + .p2align 4 L(one_or_less): jb L(zero) @@ -520,22 +546,20 @@ L(one_or_less): # else .p2align 4 -L(between_4_7): - /* Load as big endian with overlapping movbe to avoid branches. - */ - movbe (%rdi), %eax - movbe (%rsi), %ecx - shlq $32, %rax - shlq $32, %rcx - movbe -4(%rdi, %rdx), %edi - movbe -4(%rsi, %rdx), %esi - orq %rdi, %rax - orq %rsi, %rcx - subq %rcx, %rax - jz L(zero_4_7) - sbbl %eax, %eax - orl $1, %eax -L(zero_4_7): +L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx + bswap %eax + bswap %ecx + shrl %eax + shrl %ecx + movzbl -1(%rdi, %rdx), %edi + movzbl -1(%rsi, %rdx), %esi + orl %edi, %eax + orl %esi, %ecx + /* Subtraction is okay because the upper bit is zero. */ + subl %ecx, %eax /* No ymm register was touched. */ ret # endif -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
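The rewritten L(between_2_3) above packs each buffer into a 31-bit key: the first two bytes are loaded, byte-swapped to big-endian, shifted right one bit, and the last byte is or-ed into the low bits, so a plain signed subtraction gives a correctly signed result. A C sketch of that packing (cmp_2_to_3 is an illustrative name; valid only for n of 2 or 3):

    #include <stddef.h>

    static int
    cmp_2_to_3 (const unsigned char *s1, const unsigned char *s2, size_t n)
    {
      /* bswap of a zero-extended 16-bit load gives s[0]<<24 | s[1]<<16;
         the one-bit right shift keeps the packed value below 2^31.  */
      unsigned int a = ((unsigned int) s1[0] << 23)
                       | ((unsigned int) s1[1] << 15) | s1[n - 1];
      unsigned int b = ((unsigned int) s2[0] << 23)
                       | ((unsigned int) s2[1] << 15) | s2[n - 1];
      /* "Subtraction is okay because the upper bit is zero."  */
      return (int) a - (int) b;
    }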
* Re: [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S
  2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
  0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library

Disregard this patch. It's from the wrong patchset.

On Sat, Apr 9, 2022 at 7:47 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
> [snip]

^ permalink raw reply [flat|nested] 49+ messages in thread
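Every variant in this thread, including the SSSE3 code removed below, leans on the same mismatch-location idiom: pcmpeqb builds a per-byte equality mask, pmovmskb moves it to a general register, and sub $0xffff (or bsf on the inverted mask) tests and locates the first differing byte. A hedged SSE2 intrinsics sketch (first_mismatch_16 is an illustrative name; the GCC-style __builtin_ctz builtin is assumed):

    #include <emmintrin.h>

    /* Index of the first mismatch in [0, 16), or -1 if the two 16-byte
       blocks are equal.  */
    static int
    first_mismatch_16 (const void *p1, const void *p2)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) p1);
      __m128i b = _mm_loadu_si128 ((const __m128i *) p2);
      /* pcmpeqb: 0xff in each byte lane that matches, 0x00 otherwise.  */
      __m128i eq = _mm_cmpeq_epi8 (a, b);
      /* pmovmskb: one mask bit per lane; 0xffff means all 16 equal.  */
      unsigned int mask = (unsigned int) _mm_movemask_epi8 (eq);
      if (mask == 0xffff)
        return -1;
      /* Lowest zero bit of the mask = first mismatching byte, the
         asm's bsf after sub $0xffff.  */
      return __builtin_ctz (~mask);
    }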
* [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (7 preceding siblings ...) 2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein @ 2022-04-10 0:54 ` Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (3 more replies) 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 9 siblings, 4 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - 5 files changed, 2006 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 6507d1b7fa..51222dfab1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -12,7 +12,6 @@ sysdep_routines += \ memcmp-evex-movbe \ memcmp-sse2 \ memcmp-sse4 \ - memcmp-ssse3 \ memcmpeq-avx2 \ memcmpeq-avx2-rtm \ memcmpeq-evex \ @@ -179,7 +178,6 @@ sysdep_routines += \ wmemcmp-c \ wmemcmp-evex-movbe \ wmemcmp-sse4 \ - wmemcmp-ssse3 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 40cc6cc49e..f389928a4e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), - __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) #ifdef SHARED @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), - __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index cd12613699..44759a3ad5 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -20,7 +20,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S deleted file mode 100644 index df1b1fc494..0000000000 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ /dev/null @@ -1,1992 +0,0 @@ -/* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -# endif - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. -*/ - - atom_text_section -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP - test %RDX_LP, %RDX_LP - jz L(equal) -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -# endif - mov %rdx, %rcx - mov %rdi, %rdx - cmp $48, %rcx; - jae L(48bytesormore) /* LEN => 48 */ - - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -/* ECX >= 32. 
*/ -L(48bytesormore): - movdqu (%rdi), %xmm3 - movdqu (%rsi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%rdi), %rdi - lea 16(%rsi), %rsi - sub $0xffff, %edx - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %rdx, %rdi - sub %rdx, %rsi - add %rdx, %rcx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %rdx, %rsi - -# ifndef USE_AS_WMEMCMP - cmp $8, %edx - jae L(next_unaligned_table) - cmp $0, %edx - je L(shr_0) - cmp $1, %edx - je L(shr_1) - cmp $2, %edx - je L(shr_2) - cmp $3, %edx - je L(shr_3) - cmp $4, %edx - je L(shr_4) - cmp $5, %edx - je L(shr_5) - cmp $6, %edx - je L(shr_6) - jmp L(shr_7) - - .p2align 2 -L(next_unaligned_table): - cmp $8, %edx - je L(shr_8) - cmp $9, %edx - je L(shr_9) - cmp $10, %edx - je L(shr_10) - cmp $11, %edx - je L(shr_11) - cmp $12, %edx - je L(shr_12) - cmp $13, %edx - je L(shr_13) - cmp $14, %edx - je L(shr_14) - jmp L(shr_15) -# else - cmp $0, %edx - je L(shr_0) - cmp $4, %edx - je L(shr_4) - cmp $8, %edx - je L(shr_8) - jmp L(shr_12) -# endif - - .p2align 4 -L(shr_0): - cmp $80, %rcx - lea -48(%rcx), %rcx - jae L(shr_0_gobble) - xor %eax, %eax - movdqa (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_0_gobble): - movdqa (%rsi), %xmm0 - xor %eax, %eax - pcmpeqb (%rdi), %xmm0 - sub $32, %rcx - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 -L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %rcx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%rsi), %xmm0 - movdqa 48(%rsi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%rdi), %xmm0 - pcmpeqb 48(%rdi), %xmm2 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %rcx - jge L(next) - inc %edx - add $32, %rcx -L(next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_1): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_1_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $1, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $1, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $1, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_1_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $1, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $1, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_1_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $1, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $1, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_1_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_1_gobble_next) - inc %edx - add $32, %rcx -L(shr_1_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 1(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi 
- jmp L(less48bytes) - - - .p2align 4 -L(shr_2): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $2, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $2, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_2_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $2, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $2, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $2, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $2, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_2_gobble_next) - inc %edx - add $32, %rcx -L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 2(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_3_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $3, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $3, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $3, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $3, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $3, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_3_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $3, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $3, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_3_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_3_gobble_next) - inc %edx - add $32, %rcx -L(shr_3_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 3(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_4): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $4, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $4, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_4_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $4, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $4, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, 
%rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $4, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $4, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_4_gobble_next) - inc %edx - add $32, %rcx -L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 4(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_5): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_5_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $5, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $5, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $5, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_5_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $5, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $5, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_5_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $5, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $5, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_5_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_5_gobble_next) - inc %edx - add $32, %rcx -L(shr_5_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 5(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $6, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $6, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $6, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $6, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $6, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $6, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_6_gobble_next) - inc %edx - add $32, %rcx -L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 6(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae 
L(shr_7_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $7, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $7, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $7, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $7, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $7, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_7_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $7, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $7, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_7_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_7_gobble_next) - inc %edx - add $32, %rcx -L(shr_7_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 7(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_8): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $8, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $8, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_8_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $8, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $8, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $8, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $8, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_8_gobble_next) - inc %edx - add $32, %rcx -L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 8(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_9): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_9_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $9, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $9, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $9, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_9_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $9, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $9, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_9_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - 
palignr $9, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $9, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_9_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_9_gobble_next) - inc %edx - add $32, %rcx -L(shr_9_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 9(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $10, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $10, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $10, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $10, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $10, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $10, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_10_gobble_next) - inc %edx - add $32, %rcx -L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 10(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_11_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $11, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $11, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $11, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $11, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $11, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_11_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $11, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $11, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_11_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_11_gobble_next) - inc %edx - add $32, %rcx -L(shr_11_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 11(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_12): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - 
palignr $12, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $12, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_12_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $12, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $12, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $12, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $12, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_12_gobble_next) - inc %edx - add $32, %rcx -L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 12(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_13): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_13_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $13, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $13, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $13, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_13_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $13, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $13, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_13_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $13, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $13, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_13_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_13_gobble_next) - inc %edx - add $32, %rcx -L(shr_13_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 13(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $14, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $14, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $14, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $14, 48(%rsi), %xmm3 - sbb $0xffff, 
%edx - movdqa 48(%rsi), %xmm0 - palignr $14, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_14_gobble_next) - inc %edx - add $32, %rcx -L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 14(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_15_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $15, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $15, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $15, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $15, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $15, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_15_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $15, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $15, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_15_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_15_gobble_next) - inc %edx - add $32, %rcx -L(shr_15_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 15(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) -# endif - .p2align 4 -L(exit): - pmovmskb %xmm1, %r8d - sub $0xffff, %r8d - jz L(first16bytes) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - mov %r8d, %edx -L(first16bytes): - add %rax, %rsi -L(less16bytes): -# ifndef USE_AS_WMEMCMP - test %dl, %dl - jz L(next_24_bytes) - - test $0x01, %dl - jnz L(Byte16) - - test $0x02, %dl - jnz L(Byte17) - - test $0x04, %dl - jnz L(Byte18) - - test $0x08, %dl - jnz L(Byte19) - - test $0x10, %dl - jnz L(Byte20) - - test $0x20, %dl - jnz L(Byte21) - - test $0x40, %dl - jnz L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte16): - movzbl -16(%rdi), %eax - movzbl -16(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte17): - movzbl -15(%rdi), %eax - movzbl -15(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte18): - movzbl -14(%rdi), %eax - movzbl -14(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte19): - movzbl -13(%rdi), %eax - movzbl -13(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte20): - movzbl -12(%rdi), %eax - movzbl -12(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte21): - movzbl -11(%rdi), %eax - movzbl -11(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte22): - movzbl -10(%rdi), %eax - movzbl -10(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(next_24_bytes): - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - test $0x01, %dh - jnz L(Byte16) - - test $0x02, %dh - jnz L(Byte17) - - test $0x04, %dh - jnz L(Byte18) - - test $0x08, %dh - jnz L(Byte19) - - test $0x10, %dh - jnz L(Byte20) - - test $0x20, %dh - jnz L(Byte21) - - test $0x40, %dh - jnz 
L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret -# else -/* special for wmemcmp */ - xor %eax, %eax - test %dl, %dl - jz L(next_two_double_words) - and $15, %dl - jz L(second_double_word) - mov -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(second_double_word): - mov -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(next_two_double_words): - and $15, %dh - jz L(fourth_double_word) - mov -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(fourth_double_word): - mov -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) - ret -# endif - - .p2align 4 -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $0, %ecx - je L(0bytes) -# ifndef USE_AS_WMEMCMP - cmp $1, %ecx - je L(1bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) -# else - jmp L(4bytes) -# endif - - .p2align 4 -L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) -# ifndef USE_AS_WMEMCMP - cmp $9, %ecx - je L(9bytes) - cmp $10, %ecx - je L(10bytes) - cmp $11, %ecx - je L(11bytes) - cmp $12, %ecx - je L(12bytes) - cmp $13, %ecx - je L(13bytes) - cmp $14, %ecx - je L(14bytes) - jmp L(15bytes) -# else - jmp L(12bytes) -# endif - - .p2align 4 -L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) -# ifndef USE_AS_WMEMCMP - cmp $17, %ecx - je L(17bytes) - cmp $18, %ecx - je L(18bytes) - cmp $19, %ecx - je L(19bytes) - cmp $20, %ecx - je L(20bytes) - cmp $21, %ecx - je L(21bytes) - cmp $22, %ecx - je L(22bytes) - jmp L(23bytes) -# else - jmp L(20bytes) -# endif - - .p2align 4 -L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) -# ifndef USE_AS_WMEMCMP - cmp $25, %ecx - je L(25bytes) - cmp $26, %ecx - je L(26bytes) - cmp $27, %ecx - je L(27bytes) - cmp $28, %ecx - je L(28bytes) - cmp $29, %ecx - je L(29bytes) - cmp $30, %ecx - je L(30bytes) - jmp L(31bytes) -# else - jmp L(28bytes) -# endif - - .p2align 4 -L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) -# ifndef USE_AS_WMEMCMP - cmp $33, %ecx - je L(33bytes) - cmp $34, %ecx - je L(34bytes) - cmp $35, %ecx - je L(35bytes) - cmp $36, %ecx - je L(36bytes) - cmp $37, %ecx - je L(37bytes) - cmp $38, %ecx - je L(38bytes) - jmp L(39bytes) -# else - jmp L(36bytes) -# endif - - .p2align 4 -L(more40bytes): - cmp $40, %ecx - je L(40bytes) -# ifndef USE_AS_WMEMCMP - cmp $41, %ecx - je L(41bytes) - cmp $42, %ecx - je L(42bytes) - cmp $43, %ecx - je L(43bytes) - cmp $44, %ecx - je L(44bytes) - cmp $45, %ecx - je L(45bytes) - cmp $46, %ecx - je L(46bytes) - jmp L(47bytes) - - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - movl -44(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - movl -40(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - movl -36(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - movl -32(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - movl -28(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - movl -24(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - movl -20(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - movl -16(%rsi), %ecx - cmp %ecx, %eax - jne 
L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - movl -12(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - movl -8(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - movl -4(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# else - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - cmp -44(%rsi), %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - cmp -40(%rsi), %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - cmp -36(%rsi), %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - cmp -32(%rsi), %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - cmp -28(%rsi), %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - cmp -24(%rsi), %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - cmp -20(%rsi), %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(45bytes): - movl -45(%rdi), %eax - movl -45(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(41bytes): - movl -41(%rdi), %eax - movl -41(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(37bytes): - movl -37(%rdi), %eax - movl -37(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(33bytes): - movl -33(%rdi), %eax - movl -33(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(29bytes): - movl -29(%rdi), %eax - movl -29(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(25bytes): - movl -25(%rdi), %eax - movl -25(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(21bytes): - movl -21(%rdi), %eax - movl -21(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(17bytes): - movl -17(%rdi), %eax - movl -17(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(13bytes): - movl -13(%rdi), %eax - movl -13(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(9bytes): - movl -9(%rdi), %eax - movl -9(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(5bytes): - movl -5(%rdi), %eax - movl -5(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(1bytes): - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(46bytes): - movl -46(%rdi), %eax - movl -46(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(42bytes): - movl -42(%rdi), %eax - movl -42(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(38bytes): - movl -38(%rdi), %eax - movl -38(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(34bytes): - movl -34(%rdi), %eax - movl -34(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(30bytes): - movl -30(%rdi), %eax - movl -30(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(26bytes): - movl -26(%rdi), %eax - movl -26(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(22bytes): - movl -22(%rdi), %eax - movl -22(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(18bytes): - movl -18(%rdi), %eax - movl -18(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(14bytes): - movl -14(%rdi), %eax - movl -14(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(10bytes): - movl -10(%rdi), %eax - movl -10(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(6bytes): - movl -6(%rdi), %eax - movl -6(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(2bytes): - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - cmpb %cl, %al - jne L(set) - 
cmp %ecx, %eax - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(47bytes): - movl -47(%rdi), %eax - movl -47(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(43bytes): - movl -43(%rdi), %eax - movl -43(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(39bytes): - movl -39(%rdi), %eax - movl -39(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(35bytes): - movl -35(%rdi), %eax - movl -35(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(31bytes): - movl -31(%rdi), %eax - movl -31(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(27bytes): - movl -27(%rdi), %eax - movl -27(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(23bytes): - movl -23(%rdi), %eax - movl -23(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(19bytes): - movl -19(%rdi), %eax - movl -19(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(15bytes): - movl -15(%rdi), %eax - movl -15(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(11bytes): - movl -11(%rdi), %eax - movl -11(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(7bytes): - movl -7(%rdi), %eax - movl -7(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(3bytes): - movzwl -3(%rdi), %eax - movzwl -3(%rsi), %ecx - cmpb %cl, %al - jne L(set) - cmp %ecx, %eax - jne L(set) - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(find_diff): - cmpb %cl, %al - jne L(set) - cmpw %cx, %ax - jne L(set) - shr $16, %eax - shr $16, %ecx - cmpb %cl, %al - jne L(set) - -/* We get there only if we already know there is a -difference. */ - - cmp %ecx, %eax -L(set): - sbb %eax, %eax - sbb $-1, %eax - ret -# else - -/* for wmemcmp */ - .p2align 4 -L(find_diff): - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret -# endif - - .p2align 4 -L(equal): - xor %eax, %eax - ret - -END (MEMCMP) -#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S deleted file mode 100644 index a41ef95fc1..0000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_ssse3 - -#include "memcmp-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
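Most of the nearly 2000 deleted lines above come from one constraint: palignr takes its byte shift as an immediate, so the removed memcmp needs a separate L(shr_N)/L(shr_N_gobble) pair for each of the 15 possible relative misalignments (4 for wmemcmp). A minimal C-intrinsics sketch of one such path -- illustrative only, with made-up names, not code from the removed file:

#include <tmmintrin.h>  /* SSSE3 intrinsics; also pulls in SSE2.  */

/* Compare 16 bytes at D with the 16 unaligned source bytes starting
   at S_ALIGNED + 4, the pattern of L(shr_4): two aligned loads merged
   by palignr.  The shift must be a compile-time constant, hence one
   copy of the loop per shift value in the assembly.  */
static inline int
chunk_eq_shr4 (const unsigned char *d, const unsigned char *s_aligned)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) s_aligned);
  __m128i hi = _mm_load_si128 ((const __m128i *) (s_aligned + 16));
  __m128i src = _mm_alignr_epi8 (hi, lo, 4);  /* bytes 4..19 of hi:lo.  */
  __m128i eq = _mm_cmpeq_epi8 (src, _mm_loadu_si128 ((const __m128i *) d));
  return _mm_movemask_epi8 (eq) == 0xffff;    /* All 16 bytes equal?  */
}

(The assembly keeps %rdi 16-byte aligned and compares against memory directly; the unaligned load of D here just keeps the sketch self-contained.)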
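The deleted return path is also worth unpacking: L(find_diff) walks the differing dword byte by byte, and L(set) converts the flags of the last unsigned compare into the -1/+1 memcmp result with two sbb instructions and no branch. A hedged C equivalent (the function name is mine):

/* "cmp; sbb %eax, %eax; sbb $-1, %eax" from L(set): the first sbb
   materializes the carry flag as 0 or -1 and leaves it unchanged, so
   the second sbb adds 1 while subtracting the original (a < b).
   Only reached once a != b is already known.  */
static inline int
memcmp_sign (unsigned int a, unsigned int b)
{
  int t = -(int) (a < b);   /* 0 if a > b, -1 if a < b.  */
  return t + 1 - (a < b);   /* -1 if a < b, +1 if a > b.  */
}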
* [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-04-10 0:54 ` Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein ` (2 subsequent siblings) 3 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - sysdeps/x86_64/multiarch/strcmp.c | 4 - sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- sysdeps/x86_64/multiarch/strncmp.c | 4 - sysdeps/x86_64/strcmp.S | 155 ++++-------------- 10 files changed, 30 insertions(+), 202 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 51222dfab1..ed2def288d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -58,7 +58,6 @@ sysdep_routines += \ strcasecmp_l-evex \ strcasecmp_l-sse2 \ strcasecmp_l-sse4_2 \ - strcasecmp_l-ssse3 \ strcat-avx2 \ strcat-avx2-rtm \ strcat-evex \ @@ -80,7 +79,6 @@ sysdep_routines += \ strcmp-sse2 \ strcmp-sse2-unaligned \ strcmp-sse4_2 \ - strcmp-ssse3 \ strcpy-avx2 \ strcpy-avx2-rtm \ strcpy-evex \ @@ -98,7 +96,6 @@ sysdep_routines += \ strncase_l-evex \ strncase_l-sse2 \ strncase_l-sse4_2 \ - strncase_l-ssse3 \ strncat-avx2 \ strncat-avx2-rtm \ strncat-c \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncmp-evex \ strncmp-sse2 \ strncmp-sse4_2 \ - strncmp-ssse3 \ strncpy-avx2 \ strncpy-avx2-rtm \ strncpy-c \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f389928a4e..7e2be3554b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), __strcmp_sse42) - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), - __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), __strncmp_sse42) - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), - __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) #ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h index 766539c241..296d32071b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -20,7 +20,6 @@ #include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S deleted file mode 100644 index fb2f9ae14a..0000000000 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strcasecmp_l_ssse3 -#define __strcasecmp __strcasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S deleted file mode 100644 index 1b7fa33c91..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S +++ /dev/null @@ -1,5 +0,0 @@ -#if IS_IN (libc) -# define USE_SSSE3 1 -# define STRCMP __strcmp_ssse3 -# include "../strcmp.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 68cb73baad..a248c2a6e6 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ 
b/sysdeps/x86_64/multiarch/strcmp.c @@ -28,7 +28,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S deleted file mode 100644 index 6728678688..0000000000 --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRNCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strncasecmp_l_ssse3 -#define __strncasecmp __strncasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S deleted file mode 100644 index ec37308347..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S +++ /dev/null @@ -1,28 +0,0 @@ -/* strcmp optimized with SSSE3. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#define STRCMP __strncmp_ssse3 - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(strcmp) - -#define USE_SSSE3 1 -#define USE_AS_STRNCMP -#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index fca74199d8..70ae6547c9 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -27,7 +27,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 99d8b36f1d..c38dc627f9 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -59,12 +59,7 @@ # endif #endif -#ifndef USE_SSSE3 .text -#else - .section .text.ssse3,"ax",@progbits -#endif - #ifdef USE_AS_STRCASECMP_L # ifndef ENTRY2 # define ENTRY2(name) ENTRY (name) @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, 
%xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 
-#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + 
TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
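The strcmp.S hunks above are all the same mechanical substitution: with no SSSE3 build left, the #ifndef USE_SSSE3 guard goes away and the SSE2 psrldq/pslldq/por triple becomes the only way to merge two aligned loads into one unaligned 16-byte value. The two forms are equivalent, as this intrinsics sketch shows (macro names are mine, and N must be a compile-time constant from 1 to 15):

#include <tmmintrin.h>

/* Both expressions yield the 16 bytes starting N bytes into the
   32-byte concatenation HI:LO, where LO is the block at the lower
   address.  The patch keeps only the second, SSE2-only form.  */
#define MERGE_SSSE3(hi, lo, N) _mm_alignr_epi8 ((hi), (lo), (N))
#define MERGE_SSE2(hi, lo, N) \
  _mm_or_si128 (_mm_srli_si128 ((lo), (N)), \
                _mm_slli_si128 ((hi), 16 - (N)))

The SSE2 form costs two extra instructions per 16-byte block; that overhead now lands only on CPUs that lack SSE4.2 and would previously have been routed to the dropped SSSE3 variant.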
* [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-04-10 0:54 ` Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein 3 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 - sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 --------------------- sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 - 5 files changed, 879 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index ed2def288d..2b3c625ea2 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -63,7 +63,6 @@ sysdep_routines += \ strcat-evex \ strcat-sse2 \ strcat-sse2-unaligned \ - strcat-ssse3 \ strchr-avx2 \ strchr-avx2-rtm \ strchr-evex \ @@ -101,7 +100,6 @@ sysdep_routines += \ strncat-c \ strncat-evex \ strncat-sse2-unaligned \ - strncat-ssse3 \ strncmp-avx2 \ strncmp-avx2-rtm \ strncmp-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 7e2be3554b..41a04621ad 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -481,8 +481,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcat_evex) - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), - __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) @@ -630,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncat_evex) - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), - __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 5bece38f78..a15afa44e9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -23,7 +23,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S 
b/sysdeps/x86_64/multiarch/strcat-ssse3.S deleted file mode 100644 index 9f39e4fcd1..0000000000 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ /dev/null @@ -1,866 +0,0 @@ -/* strcat with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_ssse3 -# endif - -# define USE_AS_STRCAT - -.text -ENTRY (STRCAT) -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - -/* Inline corresponding strlen file, temporary until new strcpy - implementation gets merged. */ - - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, 
%edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) - -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(exit) - -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(exit) - -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(exit) - -L(aligned_64_exit_16): - lea -48(%rax), %rax - -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail1): - add $1, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail2): - add $2, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail3): - add $3, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail4): - add $4, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail5): - add $5, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail6): - add $6, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail7): - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail8): - add $8, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail9): - add $9, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail10): - add $10, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail11): - add $11, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail12): - add $12, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail13): - add $13, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail14): - add $14, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail15): - add $15, %eax - - .p2align 4 -L(StartStrcpyPart): - mov %rsi, %rcx - lea (%rdi, %rax), %rdx -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(StrncatExit0) - cmp $8, %r8 - jbe L(StrncatExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) - cmpb $0, 8(%rcx) - jz L(Exit9) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - jb L(StrncatExit15Bytes) -# endif - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) - cmpb $0, 15(%rcx) - jz L(Exit16) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - je 
L(StrncatExit16) -# define USE_AS_STRNCPY -# endif - -# include "strcpy-ssse3.S" - - .p2align 4 -L(CopyFrom1To16Bytes): - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit1): - xor %ah, %ah - movb %ah, 1(%rdx) -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit2): - xor %ah, %ah - movb %ah, 2(%rdx) -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit3): - xor %ah, %ah - movb %ah, 3(%rdx) -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit4): - xor %ah, %ah - movb %ah, 4(%rdx) -L(Exit4): - mov (%rcx), %eax - mov %eax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit5): - xor %ah, %ah - movb %ah, 5(%rdx) -L(Exit5): - mov (%rcx), %eax - mov %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit6): - xor %ah, %ah - movb %ah, 6(%rdx) -L(Exit6): - mov (%rcx), %eax - mov %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit7): - xor %ah, %ah - movb %ah, 7(%rdx) -L(Exit7): - mov (%rcx), %eax - mov %eax, (%rdx) - mov 3(%rcx), %eax - mov %eax, 3(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8): - xor %ah, %ah - movb %ah, 8(%rdx) -L(Exit8): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit9): - xor %ah, %ah - movb %ah, 9(%rdx) -L(Exit9): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movb 8(%rcx), %al - movb %al, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit10): - xor %ah, %ah - movb %ah, 10(%rdx) -L(Exit10): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movw 8(%rcx), %ax - movw %ax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit11): - xor %ah, %ah - movb %ah, 11(%rdx) -L(Exit11): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit12): - xor %ah, %ah - movb %ah, 12(%rdx) -L(Exit12): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit13): - xor %ah, %ah - movb %ah, 13(%rdx) -L(Exit13): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 5(%rcx), %xmm1 - movlpd %xmm1, 5(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit14): - xor %ah, %ah - movb %ah, 14(%rdx) -L(Exit14): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 6(%rcx), %xmm1 - movlpd %xmm1, 6(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15): - xor %ah, %ah - movb %ah, 15(%rdx) -L(Exit15): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit16): - 
xor %ah, %ah - movb %ah, 16(%rdx) -L(Exit16): - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - test $0x01, %al - jnz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - test $0x02, %al - jnz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - test $0x04, %al - jnz L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - test $0x08, %al - jnz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - test $0x10, %al - jnz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - test $0x20, %al - jnz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - test $0x40, %al - jnz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase2): - test $0x01, %ah - jnz L(Exit9) - cmp $9, %r8 - je L(StrncatExit9) - test $0x02, %ah - jnz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - test $0x04, %ah - jnz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - test $0x8, %ah - jnz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - test $0x10, %ah - jnz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - test $0x20, %ah - jnz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - test $0x40, %ah - jnz L(Exit15) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $8, %r8 - ja L(ExitHighCase3) - cmp $1, %r8 - je L(StrncatExit1) - cmp $2, %r8 - je L(StrncatExit2) - cmp $3, %r8 - je L(StrncatExit3) - cmp $4, %r8 - je L(StrncatExit4) - cmp $5, %r8 - je L(StrncatExit5) - cmp $6, %r8 - je L(StrncatExit6) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - xor %ah, %ah - movb %ah, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase3): - cmp $9, %r8 - je L(StrncatExit9) - cmp $10, %r8 - je L(StrncatExit10) - cmp $11, %r8 - je L(StrncatExit11) - cmp $12, %r8 - je L(StrncatExit12) - cmp $13, %r8 - je L(StrncatExit13) - cmp $14, %r8 - je L(StrncatExit14) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - xor %ah, %ah - movb %ah, 16(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit0): - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15Bytes): - cmp $9, %r8 - je L(StrncatExit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8Bytes): - cmpb $0, (%rcx) - jz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $2, %r8 - je 
L(StrncatExit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - -# endif -END (STRCAT) -#endif diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S deleted file mode 100644 index 6c45ff3ec7..0000000000 --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCAT -#define STRCAT __strncat_ssse3 -#include "strcat-ssse3.S" -- 2.25.1
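All of the routines touched by this series are dispatched through glibc's ifunc resolvers, which run once at relocation time and return the best implementation for the running CPU. Dropping an SSSE3 variant therefore cannot leave a target without an implementation: the resolver simply falls through to the next candidate, and SSE2 is part of the baseline x86_64 ISA. The minimal C sketch below illustrates that fall-through for strcpy; the cpu_has_* helpers, the resolve_strcpy entry point, and the exact ordering are hypothetical stand-ins for glibc's CPU_FEATURE_USABLE_P checks and real resolver, not actual glibc source. The __strcpy_* names match the entry points listed in the ifunc-impl-list.c hunks of the next patch.

/* Illustrative sketch only -- not glibc source.  */
extern int cpu_has_evex (void);	/* stand-in: AVX512VL && AVX512BW usable */
extern int cpu_has_avx2 (void);	/* stand-in: AVX2 usable */

extern char *__strcpy_evex (char *, const char *);
extern char *__strcpy_avx2 (char *, const char *);
extern char *__strcpy_sse2_unaligned (char *, const char *);

typedef char *(*strcpy_fn) (char *, const char *);

static strcpy_fn
select_strcpy (void)
{
  if (cpu_has_evex ())
    return __strcpy_evex;
  if (cpu_has_avx2 ())
    return __strcpy_avx2;
  /* No SSSE3 candidate is registered anywhere in this chain any more;
     pre-AVX2 CPUs fall through to the SSE2 code, which every x86_64
     CPU implements.  */
  return __strcpy_sse2_unaligned;
}

/* What an ifunc resolver would hand the dynamic linker at load time.  */
strcpy_fn
resolve_strcpy (void)
{
  return select_strcpy ();
}

Note also that, as with strncat-ssse3.S just above, the stpcpy, stpncpy, and strncpy SSSE3 objects removed in the next patch are only three- or four-line wrapper files that #define a symbol name and #include the shared strcpy-ssse3.S body, so deleting that one 3550-line file is what eliminates all four symbols at once.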
* [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-04-10 0:54 ` Noah Goldstein 2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein 3 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - 6 files changed, 3572 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2b3c625ea2..5b02ec8de5 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -46,13 +46,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -83,7 +81,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 41a04621ad..49ce6860d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb (%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) 
- pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, 
%xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - 
movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - 
pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave7) -# 
endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps 
%xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx 
-# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 -L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - 
.p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps 
%xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp 
$15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 - jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax 
- mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - 
cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz 
L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps 
%xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - 
lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave12): - movaps %xmm2, %xmm3 - add 
$48, %r8 - jle L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - palignr $12, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit12): - lea 4(%rdx, %rsi), %rdx - lea 4(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave13): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - palignr $13, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit13): - lea 3(%rdx, %rsi), %rdx - lea 3(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave14): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - palignr $14, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit14): - lea 2(%rdx, %rsi), %rdx - lea 2(%rcx, %rsi), %rcx - movw -2(%rcx), %ax - xor %rsi, %rsi - movw %ax, -2(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave15): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - palignr $15, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit15): - lea 1(%rdx, %rsi), %rdx - lea 1(%rcx, %rsi), %rcx - movb -1(%rcx), %ah - xor %rsi, %rsi - movb %ah, -1(%rdx) - jmp L(CopyFrom1To16BytesCase3) - -# endif -# ifndef USE_AS_STRCAT -END (STRCPY) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S deleted file mode 100644 index bf82ee447d..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_ssse3 -#include "strcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
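For readers skimming the removed strcpy-ssse3.S above: the Exit1..Exit15 and Fill1..Fill16 ladders are fixed-length unrollings of the standard strncpy semantics, i.e. copy at most n bytes, stopping at the terminating NUL, then zero-pad the remainder of the destination. A minimal C sketch of those semantics (names here are illustrative, not glibc's):

#include <stddef.h>

/* Copy at most n bytes of src into dst, stopping at the terminating
   NUL, then pad the rest of dst (including the terminator) with
   zeros.  The removed Exit and Fill ladders unroll these two loops
   for each fixed residual length from 1 to 16.  */
char *
sketch_strncpy (char *dst, const char *src, size_t n)
{
  size_t i = 0;
  for (; i < n && src[i] != '\0'; i++)
    dst[i] = src[i];
  for (; i < n; i++)
    dst[i] = '\0';
  return dst;
}

The recurring pair "cmpb $1, (%rax); sbb $-1, %rax" in the USE_AS_STPCPY exits is the branchless form of the stpncpy return-value fixup: cmpb sets the carry flag exactly when the byte at %rax is zero, and sbb $-1 adds 1 minus that carry, so %rax ends up pointing at the copied NUL when one was written and one past the last byte otherwise.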
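The mem{move|cpy}-ssse3-back removal that follows dispatches its short-copy tails through jump tables of 32-bit offsets that are relative to the table itself (the BRANCH_TO_JMPTBL_ENTRY macro), which keeps the tables free of dynamic relocations. A C sketch of that dispatch, under the same layout assumptions and with illustrative names:

#include <stdint.h>

/* Each table slot holds the signed 32-bit distance from the table
   base to the code handling that residual length, so the target is
   table + table[index].  The asm macro computes the same thing with
   a RIP-relative lea, a movslq load (SCALE is 4 because the entries
   are 4-byte offsets), and an indirect jmp.  */
static void *
jmptbl_target (const int32_t *table, unsigned long index)
{
  return (char *) table + table[index];
}

This mirrors the repeated BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) dispatch at each copy tail in the deleted file.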
* [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (2 preceding siblings ...) 2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein @ 2022-04-10 0:54 ` Noah Goldstein 3 siblings, 0 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +- sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - 5 files changed, 6 insertions(+), 3212 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5b02ec8de5..303fb5d734 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,6 @@ sysdep_routines += \ memcmpeq-evex \ memcmpeq-sse2 \ memcpy-ssse3 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -25,7 +24,6 @@ sysdep_routines += \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ memmove-ssse3 \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 49ce6860d0..c6008a73ed 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3) @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3) @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 
CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3) @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..fb01fbb301 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + return OPTIMIZE (ssse3); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. 
*/ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, 
-0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 
- movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 
-L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - 
lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - 
movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) -L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 
0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - 
movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, 
%rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, 
-0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx -L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy_fwd): - sub $0x80, %rdx -L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx -L(ll_cache_copy_fwd): - add %rcx, %rdx -L(ll_cache_copy_fwd_start): - sub $0x80, %rdx -L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) -L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx -L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy): - sub $0x80, %rdx -L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), 
%xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx -L(ll_cache_copy): - add %rcx, %rdx -L(ll_cache_copy_bwd_start): - sub $0x80, %rdx -L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) -L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -L(fwd_write_0bytes): - ret - - - .p2align 4 -L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, 
-30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) 
-L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -L(fwd_write_21bytes): 
- lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) 
-L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -L(bwd_write_0bytes): - ret - - .p2align 4 -L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - - .p2align 4 -L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 
123(%rdi) -L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - 
movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - 
movdqu %xmm0, 50(%rdi) -L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - - .p2align 3 -L(table_144_bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_67bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_125bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - - .p2align 3 -L(shl_table_fwd): - .int JMPTBL (L(shl_0), L(shl_table_fwd)) - .int JMPTBL (L(shl_1), L(shl_table_fwd)) - .int JMPTBL (L(shl_2), L(shl_table_fwd)) - .int JMPTBL (L(shl_3), L(shl_table_fwd)) - .int JMPTBL (L(shl_4), L(shl_table_fwd)) - .int JMPTBL (L(shl_5), L(shl_table_fwd)) - .int JMPTBL (L(shl_6), L(shl_table_fwd)) - .int JMPTBL (L(shl_7), L(shl_table_fwd)) - .int JMPTBL (L(shl_8), L(shl_table_fwd)) - .int JMPTBL (L(shl_9), L(shl_table_fwd)) - .int JMPTBL (L(shl_10), L(shl_table_fwd)) - .int JMPTBL (L(shl_11), L(shl_table_fwd)) - .int JMPTBL (L(shl_12), L(shl_table_fwd)) - .int JMPTBL (L(shl_13), L(shl_table_fwd)) - .int JMPTBL (L(shl_14), L(shl_table_fwd)) - .int JMPTBL (L(shl_15), L(shl_table_fwd)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S deleted file mode 100644 index f9a4e9aff9..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_back -#define MEMCPY_CHK __memmove_chk_ssse3_back -#include "memcpy-ssse3-back.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
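
The net effect of these removals on runtime dispatch shows up in the ifunc-memcmp.h hunk: with the SSSE3 branch deleted from the selector, a CPU that has SSSE3 but not SSE4.1 (e.g. the original Atom, which the deleted file's atom_text_section targeted) now falls through to the SSE2 implementation. The sketch below illustrates that selection order; it is a simplified mock rather than glibc source, with feature detection reduced to plain booleans, whereas the real selector consults glibc's cpu_features bits and applies extra preference checks before picking the AVX2/EVEX variants.

/* Minimal sketch (not glibc source) of the memcmp ifunc selection
   order after this series.  The SSSE3 branch is gone, so an
   SSSE3-only CPU falls through to the SSE2 version.  */
#include <stdio.h>
#include <stdbool.h>

struct features { bool evex, avx2, sse4_1, ssse3; };

static const char *
select_memcmp (const struct features *f)
{
  if (f->evex)
    return "__memcmp_evex_movbe";
  if (f->avx2)
    return "__memcmp_avx2_movbe";
  if (f->sse4_1)
    return "__memcmp_sse4_1";
  /* Previously: if (f->ssse3) return "__memcmp_ssse3";  */
  return "__memcmp_sse2";
}

int
main (void)
{
  struct features atom = { .ssse3 = true };  /* SSSE3 but no SSE4.1.  */
  puts (select_memcmp (&atom));              /* Now prints __memcmp_sse2.  */
  return 0;
}

Against that fallback on legacy parts, the series trades roughly 2000 lines of assembly per removed implementation, as the diffstats show; that is the code-size cost the commit message weighs against the few targets that still preferred SSSE3.
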
* [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (8 preceding siblings ...) 2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-04-14 16:47 ` Noah Goldstein 2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein ` (5 more replies) 9 siblings, 6 replies; 49+ messages in thread From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - 5 files changed, 2006 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 6507d1b7fa..51222dfab1 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -12,7 +12,6 @@ sysdep_routines += \ memcmp-evex-movbe \ memcmp-sse2 \ memcmp-sse4 \ - memcmp-ssse3 \ memcmpeq-avx2 \ memcmpeq-avx2-rtm \ memcmpeq-evex \ @@ -179,7 +178,6 @@ sysdep_routines += \ wmemcmp-c \ wmemcmp-evex-movbe \ wmemcmp-sse4 \ - wmemcmp-ssse3 \ # sysdep_routines endif diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 40cc6cc49e..f389928a4e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), __memcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), - __memcmp_ssse3) IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) #ifdef SHARED @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __wmemcmp_evex_movbe) IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), __wmemcmp_sse4_1) - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), - __wmemcmp_ssse3) IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h index cd12613699..44759a3ad5 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h @@ -20,7 +20,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) return OPTIMIZE (sse4_1); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S deleted file mode 100644 index df1b1fc494..0000000000 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S +++ /dev/null @@ -1,1992 +0,0 @@ -/* memcmp with SSSE3, wmemcmp with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef MEMCMP -# define MEMCMP __memcmp_ssse3 -# endif - -/* Warning! - wmemcmp has to use SIGNED comparison for elements. - memcmp has to use UNSIGNED comparison for elemnts. -*/ - - atom_text_section -ENTRY (MEMCMP) -# ifdef USE_AS_WMEMCMP - shl $2, %RDX_LP - test %RDX_LP, %RDX_LP - jz L(equal) -# elif defined __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -# endif - mov %rdx, %rcx - mov %rdi, %rdx - cmp $48, %rcx; - jae L(48bytesormore) /* LEN => 48 */ - - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -/* ECX >= 32. 
*/ -L(48bytesormore): - movdqu (%rdi), %xmm3 - movdqu (%rsi), %xmm0 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %edx - lea 16(%rdi), %rdi - lea 16(%rsi), %rsi - sub $0xffff, %edx - jnz L(less16bytes) - mov %edi, %edx - and $0xf, %edx - xor %rdx, %rdi - sub %rdx, %rsi - add %rdx, %rcx - mov %esi, %edx - and $0xf, %edx - jz L(shr_0) - xor %rdx, %rsi - -# ifndef USE_AS_WMEMCMP - cmp $8, %edx - jae L(next_unaligned_table) - cmp $0, %edx - je L(shr_0) - cmp $1, %edx - je L(shr_1) - cmp $2, %edx - je L(shr_2) - cmp $3, %edx - je L(shr_3) - cmp $4, %edx - je L(shr_4) - cmp $5, %edx - je L(shr_5) - cmp $6, %edx - je L(shr_6) - jmp L(shr_7) - - .p2align 2 -L(next_unaligned_table): - cmp $8, %edx - je L(shr_8) - cmp $9, %edx - je L(shr_9) - cmp $10, %edx - je L(shr_10) - cmp $11, %edx - je L(shr_11) - cmp $12, %edx - je L(shr_12) - cmp $13, %edx - je L(shr_13) - cmp $14, %edx - je L(shr_14) - jmp L(shr_15) -# else - cmp $0, %edx - je L(shr_0) - cmp $4, %edx - je L(shr_4) - cmp $8, %edx - je L(shr_8) - jmp L(shr_12) -# endif - - .p2align 4 -L(shr_0): - cmp $80, %rcx - lea -48(%rcx), %rcx - jae L(shr_0_gobble) - xor %eax, %eax - movdqa (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 - pand %xmm1, %xmm2 - pmovmskb %xmm2, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_0_gobble): - movdqa (%rsi), %xmm0 - xor %eax, %eax - pcmpeqb (%rdi), %xmm0 - sub $32, %rcx - movdqa 16(%rsi), %xmm2 - pcmpeqb 16(%rdi), %xmm2 -L(shr_0_gobble_loop): - pand %xmm0, %xmm2 - sub $32, %rcx - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - movdqa 32(%rsi), %xmm0 - movdqa 48(%rsi), %xmm2 - sbb $0xffff, %edx - pcmpeqb 32(%rdi), %xmm0 - pcmpeqb 48(%rdi), %xmm2 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - jz L(shr_0_gobble_loop) - - pand %xmm0, %xmm2 - cmp $0, %rcx - jge L(next) - inc %edx - add $32, %rcx -L(next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm2, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_1): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_1_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $1, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $1, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $1, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_1_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $1, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $1, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_1_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $1, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $1, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_1_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_1_gobble_next) - inc %edx - add $32, %rcx -L(shr_1_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 1(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi 
- jmp L(less48bytes) - - - .p2align 4 -L(shr_2): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_2_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $2, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $2, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $2, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_2_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $2, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $2, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_2_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $2, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $2, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_2_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_2_gobble_next) - inc %edx - add $32, %rcx -L(shr_2_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 2(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_3_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $3, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $3, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $3, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_3_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $3, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $3, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_3_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $3, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $3, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_3_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_3_gobble_next) - inc %edx - add $32, %rcx -L(shr_3_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 3(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_4): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_4_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $4, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $4, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $4, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_4_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $4, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $4, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_4_gobble_loop): - pand %xmm0, %xmm3 - sub $32, 
%rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $4, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $4, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_4_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_4_gobble_next) - inc %edx - add $32, %rcx -L(shr_4_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 4(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_5): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_5_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $5, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $5, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $5, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_5_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $5, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $5, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_5_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $5, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $5, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_5_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_5_gobble_next) - inc %edx - add $32, %rcx -L(shr_5_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 5(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_6_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $6, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $6, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $6, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_6_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $6, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $6, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_6_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $6, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $6, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_6_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_6_gobble_next) - inc %edx - add $32, %rcx -L(shr_6_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 6(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae 
L(shr_7_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $7, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $7, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $7, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_7_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $7, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $7, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_7_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $7, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $7, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_7_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_7_gobble_next) - inc %edx - add $32, %rcx -L(shr_7_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 7(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_8): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_8_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $8, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $8, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $8, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_8_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $8, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $8, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_8_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $8, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $8, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_8_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_8_gobble_next) - inc %edx - add $32, %rcx -L(shr_8_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 8(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_9): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_9_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $9, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $9, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $9, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_9_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $9, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $9, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_9_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - 
palignr $9, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $9, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_9_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_9_gobble_next) - inc %edx - add $32, %rcx -L(shr_9_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 9(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_10_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $10, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $10, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $10, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_10_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $10, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $10, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_10_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $10, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $10, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_10_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_10_gobble_next) - inc %edx - add $32, %rcx -L(shr_10_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 10(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_11_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $11, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $11, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $11, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_11_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $11, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $11, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_11_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $11, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $11, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_11_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_11_gobble_next) - inc %edx - add $32, %rcx -L(shr_11_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 11(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# endif - - .p2align 4 -L(shr_12): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_12_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - 
palignr $12, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $12, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $12, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_12_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $12, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $12, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_12_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $12, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $12, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_12_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_12_gobble_next) - inc %edx - add $32, %rcx -L(shr_12_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 12(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - -# ifndef USE_AS_WMEMCMP - - .p2align 4 -L(shr_13): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_13_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $13, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $13, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $13, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_13_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $13, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $13, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_13_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $13, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $13, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_13_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_13_gobble_next) - inc %edx - add $32, %rcx -L(shr_13_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 13(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_14_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $14, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $14, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $14, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_14_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $14, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $14, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_14_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $14, 48(%rsi), %xmm3 - sbb $0xffff, 
%edx - movdqa 48(%rsi), %xmm0 - palignr $14, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_14_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_14_gobble_next) - inc %edx - add $32, %rcx -L(shr_14_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 14(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15): - cmp $80, %rcx - lea -48(%rcx), %rcx - mov %edx, %eax - jae L(shr_15_gobble) - - movdqa 16(%rsi), %xmm1 - movdqa %xmm1, %xmm2 - palignr $15, (%rsi), %xmm1 - pcmpeqb (%rdi), %xmm1 - - movdqa 32(%rsi), %xmm3 - palignr $15, %xmm2, %xmm3 - pcmpeqb 16(%rdi), %xmm3 - - pand %xmm1, %xmm3 - pmovmskb %xmm3, %edx - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - add $15, %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) - - .p2align 4 -L(shr_15_gobble): - sub $32, %rcx - movdqa 16(%rsi), %xmm0 - palignr $15, (%rsi), %xmm0 - pcmpeqb (%rdi), %xmm0 - - movdqa 32(%rsi), %xmm3 - palignr $15, 16(%rsi), %xmm3 - pcmpeqb 16(%rdi), %xmm3 - -L(shr_15_gobble_loop): - pand %xmm0, %xmm3 - sub $32, %rcx - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - - movdqa 64(%rsi), %xmm3 - palignr $15, 48(%rsi), %xmm3 - sbb $0xffff, %edx - movdqa 48(%rsi), %xmm0 - palignr $15, 32(%rsi), %xmm0 - pcmpeqb 32(%rdi), %xmm0 - lea 32(%rsi), %rsi - pcmpeqb 48(%rdi), %xmm3 - - lea 32(%rdi), %rdi - jz L(shr_15_gobble_loop) - pand %xmm0, %xmm3 - - cmp $0, %rcx - jge L(shr_15_gobble_next) - inc %edx - add $32, %rcx -L(shr_15_gobble_next): - test %edx, %edx - jnz L(exit) - - pmovmskb %xmm3, %edx - movdqa %xmm0, %xmm1 - lea 32(%rdi), %rdi - lea 32(%rsi), %rsi - sub $0xffff, %edx - jnz L(exit) - - lea 15(%rsi), %rsi - add %rcx, %rsi - add %rcx, %rdi - jmp L(less48bytes) -# endif - .p2align 4 -L(exit): - pmovmskb %xmm1, %r8d - sub $0xffff, %r8d - jz L(first16bytes) - lea -16(%rsi), %rsi - lea -16(%rdi), %rdi - mov %r8d, %edx -L(first16bytes): - add %rax, %rsi -L(less16bytes): -# ifndef USE_AS_WMEMCMP - test %dl, %dl - jz L(next_24_bytes) - - test $0x01, %dl - jnz L(Byte16) - - test $0x02, %dl - jnz L(Byte17) - - test $0x04, %dl - jnz L(Byte18) - - test $0x08, %dl - jnz L(Byte19) - - test $0x10, %dl - jnz L(Byte20) - - test $0x20, %dl - jnz L(Byte21) - - test $0x40, %dl - jnz L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte16): - movzbl -16(%rdi), %eax - movzbl -16(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte17): - movzbl -15(%rdi), %eax - movzbl -15(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte18): - movzbl -14(%rdi), %eax - movzbl -14(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte19): - movzbl -13(%rdi), %eax - movzbl -13(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte20): - movzbl -12(%rdi), %eax - movzbl -12(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte21): - movzbl -11(%rdi), %eax - movzbl -11(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(Byte22): - movzbl -10(%rdi), %eax - movzbl -10(%rsi), %edx - sub %edx, %eax - ret - - .p2align 4 -L(next_24_bytes): - lea 8(%rdi), %rdi - lea 8(%rsi), %rsi - test $0x01, %dh - jnz L(Byte16) - - test $0x02, %dh - jnz L(Byte17) - - test $0x04, %dh - jnz L(Byte18) - - test $0x08, %dh - jnz L(Byte19) - - test $0x10, %dh - jnz L(Byte20) - - test $0x20, %dh - jnz L(Byte21) - - test $0x40, %dh - jnz 
L(Byte22) - - movzbl -9(%rdi), %eax - movzbl -9(%rsi), %edx - sub %edx, %eax - ret -# else -/* special for wmemcmp */ - xor %eax, %eax - test %dl, %dl - jz L(next_two_double_words) - and $15, %dl - jz L(second_double_word) - mov -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(second_double_word): - mov -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(next_two_double_words): - and $15, %dh - jz L(fourth_double_word) - mov -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) - ret - - .p2align 4 -L(fourth_double_word): - mov -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) - ret -# endif - - .p2align 4 -L(less48bytes): - cmp $8, %ecx - jae L(more8bytes) - cmp $0, %ecx - je L(0bytes) -# ifndef USE_AS_WMEMCMP - cmp $1, %ecx - je L(1bytes) - cmp $2, %ecx - je L(2bytes) - cmp $3, %ecx - je L(3bytes) - cmp $4, %ecx - je L(4bytes) - cmp $5, %ecx - je L(5bytes) - cmp $6, %ecx - je L(6bytes) - jmp L(7bytes) -# else - jmp L(4bytes) -# endif - - .p2align 4 -L(more8bytes): - cmp $16, %ecx - jae L(more16bytes) - cmp $8, %ecx - je L(8bytes) -# ifndef USE_AS_WMEMCMP - cmp $9, %ecx - je L(9bytes) - cmp $10, %ecx - je L(10bytes) - cmp $11, %ecx - je L(11bytes) - cmp $12, %ecx - je L(12bytes) - cmp $13, %ecx - je L(13bytes) - cmp $14, %ecx - je L(14bytes) - jmp L(15bytes) -# else - jmp L(12bytes) -# endif - - .p2align 4 -L(more16bytes): - cmp $24, %ecx - jae L(more24bytes) - cmp $16, %ecx - je L(16bytes) -# ifndef USE_AS_WMEMCMP - cmp $17, %ecx - je L(17bytes) - cmp $18, %ecx - je L(18bytes) - cmp $19, %ecx - je L(19bytes) - cmp $20, %ecx - je L(20bytes) - cmp $21, %ecx - je L(21bytes) - cmp $22, %ecx - je L(22bytes) - jmp L(23bytes) -# else - jmp L(20bytes) -# endif - - .p2align 4 -L(more24bytes): - cmp $32, %ecx - jae L(more32bytes) - cmp $24, %ecx - je L(24bytes) -# ifndef USE_AS_WMEMCMP - cmp $25, %ecx - je L(25bytes) - cmp $26, %ecx - je L(26bytes) - cmp $27, %ecx - je L(27bytes) - cmp $28, %ecx - je L(28bytes) - cmp $29, %ecx - je L(29bytes) - cmp $30, %ecx - je L(30bytes) - jmp L(31bytes) -# else - jmp L(28bytes) -# endif - - .p2align 4 -L(more32bytes): - cmp $40, %ecx - jae L(more40bytes) - cmp $32, %ecx - je L(32bytes) -# ifndef USE_AS_WMEMCMP - cmp $33, %ecx - je L(33bytes) - cmp $34, %ecx - je L(34bytes) - cmp $35, %ecx - je L(35bytes) - cmp $36, %ecx - je L(36bytes) - cmp $37, %ecx - je L(37bytes) - cmp $38, %ecx - je L(38bytes) - jmp L(39bytes) -# else - jmp L(36bytes) -# endif - - .p2align 4 -L(more40bytes): - cmp $40, %ecx - je L(40bytes) -# ifndef USE_AS_WMEMCMP - cmp $41, %ecx - je L(41bytes) - cmp $42, %ecx - je L(42bytes) - cmp $43, %ecx - je L(43bytes) - cmp $44, %ecx - je L(44bytes) - cmp $45, %ecx - je L(45bytes) - cmp $46, %ecx - je L(46bytes) - jmp L(47bytes) - - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - movl -44(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - movl -40(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - movl -36(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - movl -32(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - movl -28(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - movl -24(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - movl -20(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - movl -16(%rsi), %ecx - cmp %ecx, %eax - jne 
L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - movl -12(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - movl -8(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - movl -4(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# else - .p2align 4 -L(44bytes): - movl -44(%rdi), %eax - cmp -44(%rsi), %eax - jne L(find_diff) -L(40bytes): - movl -40(%rdi), %eax - cmp -40(%rsi), %eax - jne L(find_diff) -L(36bytes): - movl -36(%rdi), %eax - cmp -36(%rsi), %eax - jne L(find_diff) -L(32bytes): - movl -32(%rdi), %eax - cmp -32(%rsi), %eax - jne L(find_diff) -L(28bytes): - movl -28(%rdi), %eax - cmp -28(%rsi), %eax - jne L(find_diff) -L(24bytes): - movl -24(%rdi), %eax - cmp -24(%rsi), %eax - jne L(find_diff) -L(20bytes): - movl -20(%rdi), %eax - cmp -20(%rsi), %eax - jne L(find_diff) -L(16bytes): - movl -16(%rdi), %eax - cmp -16(%rsi), %eax - jne L(find_diff) -L(12bytes): - movl -12(%rdi), %eax - cmp -12(%rsi), %eax - jne L(find_diff) -L(8bytes): - movl -8(%rdi), %eax - cmp -8(%rsi), %eax - jne L(find_diff) -L(4bytes): - movl -4(%rdi), %eax - cmp -4(%rsi), %eax - jne L(find_diff) -L(0bytes): - xor %eax, %eax - ret -# endif - -# ifndef USE_AS_WMEMCMP - .p2align 4 -L(45bytes): - movl -45(%rdi), %eax - movl -45(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(41bytes): - movl -41(%rdi), %eax - movl -41(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(37bytes): - movl -37(%rdi), %eax - movl -37(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(33bytes): - movl -33(%rdi), %eax - movl -33(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(29bytes): - movl -29(%rdi), %eax - movl -29(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(25bytes): - movl -25(%rdi), %eax - movl -25(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(21bytes): - movl -21(%rdi), %eax - movl -21(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(17bytes): - movl -17(%rdi), %eax - movl -17(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(13bytes): - movl -13(%rdi), %eax - movl -13(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(9bytes): - movl -9(%rdi), %eax - movl -9(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(5bytes): - movl -5(%rdi), %eax - movl -5(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(1bytes): - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(46bytes): - movl -46(%rdi), %eax - movl -46(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(42bytes): - movl -42(%rdi), %eax - movl -42(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(38bytes): - movl -38(%rdi), %eax - movl -38(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(34bytes): - movl -34(%rdi), %eax - movl -34(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(30bytes): - movl -30(%rdi), %eax - movl -30(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(26bytes): - movl -26(%rdi), %eax - movl -26(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(22bytes): - movl -22(%rdi), %eax - movl -22(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(18bytes): - movl -18(%rdi), %eax - movl -18(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(14bytes): - movl -14(%rdi), %eax - movl -14(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(10bytes): - movl -10(%rdi), %eax - movl -10(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(6bytes): - movl -6(%rdi), %eax - movl -6(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(2bytes): - movzwl -2(%rdi), %eax - movzwl -2(%rsi), %ecx - cmpb %cl, %al - jne L(set) - 
cmp %ecx, %eax - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(47bytes): - movl -47(%rdi), %eax - movl -47(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(43bytes): - movl -43(%rdi), %eax - movl -43(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(39bytes): - movl -39(%rdi), %eax - movl -39(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(35bytes): - movl -35(%rdi), %eax - movl -35(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(31bytes): - movl -31(%rdi), %eax - movl -31(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(27bytes): - movl -27(%rdi), %eax - movl -27(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(23bytes): - movl -23(%rdi), %eax - movl -23(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(19bytes): - movl -19(%rdi), %eax - movl -19(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(15bytes): - movl -15(%rdi), %eax - movl -15(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(11bytes): - movl -11(%rdi), %eax - movl -11(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(7bytes): - movl -7(%rdi), %eax - movl -7(%rsi), %ecx - cmp %ecx, %eax - jne L(find_diff) -L(3bytes): - movzwl -3(%rdi), %eax - movzwl -3(%rsi), %ecx - cmpb %cl, %al - jne L(set) - cmp %ecx, %eax - jne L(set) - movzbl -1(%rdi), %eax - cmpb -1(%rsi), %al - jne L(set) - xor %eax, %eax - ret - - .p2align 4 -L(find_diff): - cmpb %cl, %al - jne L(set) - cmpw %cx, %ax - jne L(set) - shr $16, %eax - shr $16, %ecx - cmpb %cl, %al - jne L(set) - -/* We get there only if we already know there is a -difference. */ - - cmp %ecx, %eax -L(set): - sbb %eax, %eax - sbb $-1, %eax - ret -# else - -/* for wmemcmp */ - .p2align 4 -L(find_diff): - mov $1, %eax - jg L(find_diff_bigger) - neg %eax - ret - - .p2align 4 -L(find_diff_bigger): - ret -# endif - - .p2align 4 -L(equal): - xor %eax, %eax - ret - -END (MEMCMP) -#endif diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S deleted file mode 100644 index a41ef95fc1..0000000000 --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_WMEMCMP 1 -#define MEMCMP __wmemcmp_ssse3 - -#include "memcmp-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
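For readers following the deleted memcmp-ssse3.S above: the core mismatch test that every path in that file repeats is pcmpeqb to compare 16 bytes, pmovmskb to collapse the byte-wise result into a 16-bit mask, and "sub $0xffff" to turn all-equal into zero. The C sketch below illustrates that idiom with SSE2 intrinsics; the helper name first_diff16 is illustrative, not a glibc symbol, and this is a minimal sketch of the technique rather than the removed implementation.

    #include <emmintrin.h>  /* SSE2 intrinsics: pcmpeqb, pmovmskb */

    /* Illustrative helper, not glibc API: compare one 16-byte block
       and return a memcmp-style result (0 if the blocks are equal).  */
    static int
    first_diff16 (const unsigned char *a, const unsigned char *b)
    {
      __m128i va = _mm_loadu_si128 ((const __m128i *) a);
      __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
      /* pcmpeqb: 0xff where bytes match, 0x00 where they differ.  */
      __m128i eq = _mm_cmpeq_epi8 (va, vb);
      /* pmovmskb: one mask bit per byte; 0xffff means all 16 bytes
         are equal (the asm's "sub $0xffff, %edx ; jnz").  */
      unsigned int mask = (unsigned int) _mm_movemask_epi8 (eq);
      if (mask == 0xffff)
        return 0;
      /* Index of the first differing byte; as the deleted file's
         comment warns, memcmp must compare bytes UNSIGNED.  */
      int idx = __builtin_ctz (~mask & 0xffff);
      return (int) a[idx] - (int) b[idx];
    }

The shr_N entry points in the deleted file exist because pcmpeqb wants at least one aligned operand: the code aligns %rdi, then uses palignr (one label per possible 1..15 byte offset, since palignr takes an immediate) to realign the %rsi data on the fly before running this comparison.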
* [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein @ 2022-04-14 16:47 ` Noah Goldstein 2022-04-14 18:05 ` H.J. Lu 2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein ` (4 subsequent siblings) 5 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - sysdeps/x86_64/multiarch/strcmp.c | 4 - sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- sysdeps/x86_64/multiarch/strncmp.c | 4 - sysdeps/x86_64/strcmp.S | 155 ++++-------------- 10 files changed, 30 insertions(+), 202 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 51222dfab1..ed2def288d 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -58,7 +58,6 @@ sysdep_routines += \ strcasecmp_l-evex \ strcasecmp_l-sse2 \ strcasecmp_l-sse4_2 \ - strcasecmp_l-ssse3 \ strcat-avx2 \ strcat-avx2-rtm \ strcat-evex \ @@ -80,7 +79,6 @@ sysdep_routines += \ strcmp-sse2 \ strcmp-sse2-unaligned \ strcmp-sse4_2 \ - strcmp-ssse3 \ strcpy-avx2 \ strcpy-avx2-rtm \ strcpy-evex \ @@ -98,7 +96,6 @@ sysdep_routines += \ strncase_l-evex \ strncase_l-sse2 \ strncase_l-sse4_2 \ - strncase_l-ssse3 \ strncat-avx2 \ strncat-avx2-rtm \ strncat-c \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncmp-evex \ strncmp-sse2 \ strncmp-sse4_2 \ - strncmp-ssse3 \ strncpy-avx2 \ strncpy-avx2-rtm \ strncpy-c \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index f389928a4e..7e2be3554b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strcasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strcasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strcasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strcasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, __strcasecmp_l_sse2)) @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strcmp_evex) IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), __strcmp_sse42) - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), - __strcmp_ssse3) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_sse2)) @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, strncasecmp_l, CPU_FEATURE_USABLE (SSE4_2), __strncasecmp_l_sse42) - IFUNC_IMPL_ADD (array, i, strncasecmp_l, - CPU_FEATURE_USABLE (SSSE3), - __strncasecmp_l_ssse3) IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, __strncasecmp_l_sse2)) @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __strncmp_evex) IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), __strncmp_sse42) - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), - __strncmp_ssse3) IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) #ifdef SHARED diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h index 766539c241..296d32071b 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h @@ -20,7 +20,6 @@ #include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S deleted file mode 100644 index fb2f9ae14a..0000000000 --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strcasecmp_l_ssse3 -#define __strcasecmp __strcasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S deleted file mode 100644 index 1b7fa33c91..0000000000 --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S +++ /dev/null @@ -1,5 +0,0 @@ -#if IS_IN (libc) -# define USE_SSSE3 1 -# define STRCMP __strcmp_ssse3 -# include "../strcmp.S" -#endif diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c index 68cb73baad..a248c2a6e6 100644 --- a/sysdeps/x86_64/multiarch/strcmp.c +++ 
b/sysdeps/x86_64/multiarch/strcmp.c @@ -28,7 +28,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S deleted file mode 100644 index 6728678688..0000000000 --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S +++ /dev/null @@ -1,6 +0,0 @@ -#define USE_SSSE3 1 -#define USE_AS_STRNCASECMP_L -#define NO_NOLOCALE_ALIAS -#define STRCMP __strncasecmp_l_ssse3 -#define __strncasecmp __strncasecmp_ssse3 -#include "../strcmp.S" diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S deleted file mode 100644 index ec37308347..0000000000 --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S +++ /dev/null @@ -1,28 +0,0 @@ -/* strcmp optimized with SSSE3. - Copyright (C) 2017-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#define STRCMP __strncmp_ssse3 - -#undef libc_hidden_builtin_def -#define libc_hidden_builtin_def(strcmp) - -#define USE_SSSE3 1 -#define USE_AS_STRNCMP -#include <sysdeps/x86_64/strcmp.S> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c index fca74199d8..70ae6547c9 100644 --- a/sysdeps/x86_64/multiarch/strncmp.c +++ b/sysdeps/x86_64/multiarch/strncmp.c @@ -27,7 +27,6 @@ # include <init-arch.h> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) return OPTIMIZE (sse42); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S index 99d8b36f1d..c38dc627f9 100644 --- a/sysdeps/x86_64/strcmp.S +++ b/sysdeps/x86_64/strcmp.S @@ -59,12 +59,7 @@ # endif #endif -#ifndef USE_SSSE3 .text -#else - .section .text.ssse3,"ax",@progbits -#endif - #ifdef USE_AS_STRCASECMP_L # ifndef ENTRY2 # define ENTRY2(name) ENTRY (name) @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 /* store for next cycle */ -#ifndef USE_SSSE3 psrldq $1, %xmm3 pslldq $15, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $2, %xmm3 pslldq $14, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $3, %xmm3 pslldq $13, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, 
%xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $4, %xmm3 pslldq $12, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $5, %xmm3 pslldq $11, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $6, %xmm3 pslldq $10, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $7, %xmm3 pslldq $9, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $8, %xmm3 pslldq $8, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $9, %xmm3 pslldq $7, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ 
-#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $10, %xmm3 pslldq $6, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $11, %xmm3 pslldq $5, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $12, %xmm3 pslldq $4, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $13, %xmm3 pslldq $3, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $14, %xmm3 pslldq $2, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + 
TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): movdqa (%rdi, %rcx), %xmm2 movdqa %xmm2, %xmm4 -#ifndef USE_SSSE3 psrldq $15, %xmm3 pslldq $1, %xmm2 por %xmm3, %xmm2 /* merge into one 16byte value */ -#else - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ -#endif + TOLOWER (%xmm1, %xmm2) pcmpeqb %xmm1, %xmm0 -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
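The strcmp.S hunks above all make the same substitution: with the USE_SSSE3 builds gone, the SSSE3 palignr merge in the gobble_ashr_N loops is dropped and the generic SSE2 sequence (psrldq on the previous block, pslldq on the new one, por to merge) becomes the only path. Below is a minimal C sketch of the equivalence, using intrinsics; SHIFT stands for the per-case constant 1..15 that strcmp.S hard-codes once per gobble_ashr_N label (palignr and the byte-shift intrinsics all require compile-time immediates), and the function names are illustrative only.

    #include <tmmintrin.h>  /* SSSE3: palignr; pulls in SSE2 as well */

    #define SHIFT 1  /* illustrative: one fixed value per ashr label */

    /* SSE2 merge kept by the patch: byte-wise
       (lo >> SHIFT) | (hi << (16 - SHIFT)),
       i.e. psrldq + pslldq + por.  */
    static __m128i
    merge_sse2 (__m128i hi, __m128i lo)
    {
      return _mm_or_si128 (_mm_srli_si128 (lo, SHIFT),
                           _mm_slli_si128 (hi, 16 - SHIFT));
    }

    /* SSSE3 merge removed by the patch: palignr concatenates hi:lo
       and shifts right by SHIFT bytes in a single instruction.  */
    static __m128i
    merge_ssse3 (__m128i hi, __m128i lo)
    {
      return _mm_alignr_epi8 (hi, lo, SHIFT);
    }

Both functions produce the 16 bytes starting SHIFT bytes into the lo:hi pair, which is what the deleted "merge into one 16byte value" comments describe. Keeping only the SSE2 form costs two extra instructions per merge, which is a non-issue once no IFUNC selector dispatches to an SSSE3 build anyway.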
* Re: [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-04-14 18:05 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-04-14 18:05 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 -- > sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - > sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 - > sysdeps/x86_64/multiarch/strcmp.c | 4 - > sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 - > sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ---- > sysdeps/x86_64/multiarch/strncmp.c | 4 - > sysdeps/x86_64/strcmp.S | 155 ++++-------------- > 10 files changed, 30 insertions(+), 202 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 51222dfab1..ed2def288d 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -58,7 +58,6 @@ sysdep_routines += \ > strcasecmp_l-evex \ > strcasecmp_l-sse2 \ > strcasecmp_l-sse4_2 \ > - strcasecmp_l-ssse3 \ > strcat-avx2 \ > strcat-avx2-rtm \ > strcat-evex \ > @@ -80,7 +79,6 @@ sysdep_routines += \ > strcmp-sse2 \ > strcmp-sse2-unaligned \ > strcmp-sse4_2 \ > - strcmp-ssse3 \ > strcpy-avx2 \ > strcpy-avx2-rtm \ > strcpy-evex \ > @@ -98,7 +96,6 @@ sysdep_routines += \ > strncase_l-evex \ > strncase_l-sse2 \ > strncase_l-sse4_2 \ > - strncase_l-ssse3 \ > strncat-avx2 \ > strncat-avx2-rtm \ > strncat-c \ > @@ -110,7 +107,6 @@ sysdep_routines += \ > strncmp-evex \ > strncmp-sse2 \ > strncmp-sse4_2 \ > - strncmp-ssse3 \ > strncpy-avx2 \ > strncpy-avx2-rtm \ > strncpy-c \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index f389928a4e..7e2be3554b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ > @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strcasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strcasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strcasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strcasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1, > __strcasecmp_l_sse2)) > > @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strcmp_evex) > IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), > __strcmp_sse42) > - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), > - __strcmp_ssse3) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2)) > > @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp, 1, > __strncasecmp_sse2)) > > @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, strncasecmp_l, > CPU_FEATURE_USABLE (SSE4_2), > __strncasecmp_l_sse42) > - IFUNC_IMPL_ADD (array, i, strncasecmp_l, > - CPU_FEATURE_USABLE (SSSE3), > - __strncasecmp_l_ssse3) > IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1, > __strncasecmp_l_sse2)) > > @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __strncmp_evex) > IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), > __strncmp_sse42) > - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), > - __strncmp_ssse3) > IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2)) > > #ifdef SHARED > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > index 766539c241..296d32071b 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h > @@ -20,7 +20,6 @@ > #include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > deleted file mode 100644 > index fb2f9ae14a..0000000000 > --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strcasecmp_l_ssse3 > -#define __strcasecmp __strcasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S > deleted file mode 100644 > index 1b7fa33c91..0000000000 > --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S > +++ /dev/null > @@ -1,5 +0,0 @@ > -#if IS_IN (libc) > -# define USE_SSSE3 1 > -# define STRCMP __strcmp_ssse3 > -# include "../strcmp.S" > -#endif > diff 
--git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c > index 68cb73baad..a248c2a6e6 100644 > --- a/sysdeps/x86_64/multiarch/strcmp.c > +++ b/sysdeps/x86_64/multiarch/strcmp.c > @@ -28,7 +28,6 @@ > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > return OPTIMIZE (sse2_unaligned); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > deleted file mode 100644 > index 6728678688..0000000000 > --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S > +++ /dev/null > @@ -1,6 +0,0 @@ > -#define USE_SSSE3 1 > -#define USE_AS_STRNCASECMP_L > -#define NO_NOLOCALE_ALIAS > -#define STRCMP __strncasecmp_l_ssse3 > -#define __strncasecmp __strncasecmp_ssse3 > -#include "../strcmp.S" > diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S > deleted file mode 100644 > index ec37308347..0000000000 > --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S > +++ /dev/null > @@ -1,28 +0,0 @@ > -/* strcmp optimized with SSSE3. > - Copyright (C) 2017-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#define STRCMP __strncmp_ssse3 > - > -#undef libc_hidden_builtin_def > -#define libc_hidden_builtin_def(strcmp) > - > -#define USE_SSSE3 1 > -#define USE_AS_STRNCMP > -#include <sysdeps/x86_64/strcmp.S> > diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c > index fca74199d8..70ae6547c9 100644 > --- a/sysdeps/x86_64/multiarch/strncmp.c > +++ b/sysdeps/x86_64/multiarch/strncmp.c > @@ -27,7 +27,6 @@ > # include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void) > && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) > return OPTIMIZE (sse42); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > > diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S > index 99d8b36f1d..c38dc627f9 100644 > --- a/sysdeps/x86_64/strcmp.S > +++ b/sysdeps/x86_64/strcmp.S > @@ -59,12 +59,7 @@ > # endif > #endif > > -#ifndef USE_SSSE3 > .text > -#else > - .section .text.ssse3,"ax",@progbits > -#endif > - > #ifdef USE_AS_STRCASECMP_L > # ifndef ENTRY2 > # define ENTRY2(name) ENTRY (name) > @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq $15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 /* store for next cycle */ > > -#ifndef USE_SSSE3 > psrldq $1, %xmm3 > pslldq $15, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $2, %xmm3 > pslldq $14, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $3, %xmm3 > pslldq $13, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */ > 
-#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $4, %xmm3 > pslldq $12, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $5, %xmm3 > pslldq $11, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $6, %xmm3 > pslldq $10, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $7, %xmm3 > pslldq $9, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $8, %xmm3 > pslldq $8, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ 
-1348,13 +1295,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $9, %xmm3 > pslldq $7, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $10, %xmm3 > pslldq $6, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $11, %xmm3 > pslldq $5, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $12, %xmm3 > pslldq $4, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $13, %xmm3 > pslldq $3, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -1973,13 +1890,10 @@ 
LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $14, %xmm3 > pslldq $2, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15): > movdqa (%rdi, %rcx), %xmm2 > movdqa %xmm2, %xmm4 > > -#ifndef USE_SSSE3 > psrldq $15, %xmm3 > pslldq $1, %xmm2 > por %xmm3, %xmm2 /* merge into one 16byte value */ > -#else > - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */ > -#endif > + > TOLOWER (%xmm1, %xmm2) > > pcmpeqb %xmm1, %xmm0 > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
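[Editorial note on the strcmp.S hunks above, since the diff shows only what is deleted: with USE_SSSE3 gone, the unconditional SSE2 merge sequence is all that remains. palignr $N, %xmm3, %xmm2 produces the low 16 bytes of the 32-byte concatenation xmm2:xmm3 shifted right by N bytes; the psrldq/pslldq/por triple the patch keeps computes exactly the same merged value, at the cost of two extra instructions per iteration. A minimal standalone sketch of the equivalence using intrinsics — the SHIFT constant and every identifier here are illustrative, not taken from the patch; compile with something like gcc -O2 -mssse3:

#include <stdio.h>
#include <string.h>
#include <tmmintrin.h>          /* SSSE3: _mm_alignr_epi8 (palignr) */

#define SHIFT 1                 /* mirrors the gobble_ashr_1 case */

int
main (void)
{
  unsigned char mem[32];
  for (int i = 0; i < 32; i++)
    mem[i] = (unsigned char) i;

  __m128i lo = _mm_loadu_si128 ((__m128i *) mem);        /* older chunk (xmm3) */
  __m128i hi = _mm_loadu_si128 ((__m128i *) (mem + 16)); /* newer chunk (xmm2) */

  /* SSSE3 path removed by the patch: low 16 bytes of (hi:lo) >> 8*SHIFT.  */
  __m128i a = _mm_alignr_epi8 (hi, lo, SHIFT);

  /* SSE2 path the patch keeps: psrldq + pslldq + por.  */
  __m128i b = _mm_or_si128 (_mm_srli_si128 (lo, SHIFT),
                            _mm_slli_si128 (hi, 16 - SHIFT));

  unsigned char ra[16], rb[16];
  _mm_storeu_si128 ((__m128i *) ra, a);
  _mm_storeu_si128 ((__m128i *) rb, b);
  puts (memcmp (ra, rb, 16) == 0 ? "match" : "MISMATCH");
  return 0;
}

This prints "match" for any SHIFT from 1 to 15, which is why the series can drop the .text.ssse3 section and all the USE_SSSE3 conditionals without changing what the remaining SSE2 path computes.]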
* [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-04-14 16:47 ` Noah Goldstein 2022-04-14 18:06 ` H.J. Lu 2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein ` (3 subsequent siblings) 5 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 - sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 --------------------- sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 - 5 files changed, 879 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index ed2def288d..2b3c625ea2 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -63,7 +63,6 @@ sysdep_routines += \ strcat-evex \ strcat-sse2 \ strcat-sse2-unaligned \ - strcat-ssse3 \ strchr-avx2 \ strchr-avx2-rtm \ strchr-evex \ @@ -101,7 +100,6 @@ sysdep_routines += \ strncat-c \ strncat-evex \ strncat-sse2-unaligned \ - strncat-ssse3 \ strncmp-avx2 \ strncmp-avx2-rtm \ strncmp-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 7e2be3554b..41a04621ad 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -481,8 +481,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcat_evex) - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), - __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) @@ -630,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncat_evex) - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), - __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 5bece38f78..a15afa44e9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -23,7 +23,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S 
deleted file mode 100644 index 9f39e4fcd1..0000000000 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ /dev/null @@ -1,866 +0,0 @@ -/* strcat with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_ssse3 -# endif - -# define USE_AS_STRCAT - -.text -ENTRY (STRCAT) -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - -/* Inline corresponding strlen file, temporary until new strcpy - implementation gets merged. */ - - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - 
pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) - -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(exit) - -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(exit) - -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(exit) - -L(aligned_64_exit_16): - lea -48(%rax), %rax - -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail1): - add $1, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail2): - add $2, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail3): - add $3, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail4): - add $4, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail5): - add $5, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail6): - add $6, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail7): - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail8): - add $8, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail9): - add $9, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail10): - add $10, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail11): - add $11, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail12): - add $12, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail13): - add $13, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail14): - add $14, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail15): - add $15, %eax - - .p2align 4 -L(StartStrcpyPart): - mov %rsi, %rcx - lea (%rdi, %rax), %rdx -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(StrncatExit0) - cmp $8, %r8 - jbe L(StrncatExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) - cmpb $0, 8(%rcx) - jz L(Exit9) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - jb L(StrncatExit15Bytes) -# endif - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) - cmpb $0, 15(%rcx) - jz L(Exit16) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - je L(StrncatExit16) -# define USE_AS_STRNCPY 
-# endif - -# include "strcpy-ssse3.S" - - .p2align 4 -L(CopyFrom1To16Bytes): - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit1): - xor %ah, %ah - movb %ah, 1(%rdx) -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit2): - xor %ah, %ah - movb %ah, 2(%rdx) -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit3): - xor %ah, %ah - movb %ah, 3(%rdx) -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit4): - xor %ah, %ah - movb %ah, 4(%rdx) -L(Exit4): - mov (%rcx), %eax - mov %eax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit5): - xor %ah, %ah - movb %ah, 5(%rdx) -L(Exit5): - mov (%rcx), %eax - mov %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit6): - xor %ah, %ah - movb %ah, 6(%rdx) -L(Exit6): - mov (%rcx), %eax - mov %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit7): - xor %ah, %ah - movb %ah, 7(%rdx) -L(Exit7): - mov (%rcx), %eax - mov %eax, (%rdx) - mov 3(%rcx), %eax - mov %eax, 3(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8): - xor %ah, %ah - movb %ah, 8(%rdx) -L(Exit8): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit9): - xor %ah, %ah - movb %ah, 9(%rdx) -L(Exit9): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movb 8(%rcx), %al - movb %al, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit10): - xor %ah, %ah - movb %ah, 10(%rdx) -L(Exit10): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movw 8(%rcx), %ax - movw %ax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit11): - xor %ah, %ah - movb %ah, 11(%rdx) -L(Exit11): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit12): - xor %ah, %ah - movb %ah, 12(%rdx) -L(Exit12): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit13): - xor %ah, %ah - movb %ah, 13(%rdx) -L(Exit13): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 5(%rcx), %xmm1 - movlpd %xmm1, 5(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit14): - xor %ah, %ah - movb %ah, 14(%rdx) -L(Exit14): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 6(%rcx), %xmm1 - movlpd %xmm1, 6(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15): - xor %ah, %ah - movb %ah, 15(%rdx) -L(Exit15): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit16): - xor %ah, %ah - movb %ah, 16(%rdx) 
-L(Exit16): - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - test $0x01, %al - jnz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - test $0x02, %al - jnz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - test $0x04, %al - jnz L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - test $0x08, %al - jnz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - test $0x10, %al - jnz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - test $0x20, %al - jnz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - test $0x40, %al - jnz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase2): - test $0x01, %ah - jnz L(Exit9) - cmp $9, %r8 - je L(StrncatExit9) - test $0x02, %ah - jnz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - test $0x04, %ah - jnz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - test $0x8, %ah - jnz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - test $0x10, %ah - jnz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - test $0x20, %ah - jnz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - test $0x40, %ah - jnz L(Exit15) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $8, %r8 - ja L(ExitHighCase3) - cmp $1, %r8 - je L(StrncatExit1) - cmp $2, %r8 - je L(StrncatExit2) - cmp $3, %r8 - je L(StrncatExit3) - cmp $4, %r8 - je L(StrncatExit4) - cmp $5, %r8 - je L(StrncatExit5) - cmp $6, %r8 - je L(StrncatExit6) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - xor %ah, %ah - movb %ah, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase3): - cmp $9, %r8 - je L(StrncatExit9) - cmp $10, %r8 - je L(StrncatExit10) - cmp $11, %r8 - je L(StrncatExit11) - cmp $12, %r8 - je L(StrncatExit12) - cmp $13, %r8 - je L(StrncatExit13) - cmp $14, %r8 - je L(StrncatExit14) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - xor %ah, %ah - movb %ah, 16(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit0): - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15Bytes): - cmp $9, %r8 - je L(StrncatExit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8Bytes): - cmpb $0, (%rcx) - jz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - cmpb $0, 2(%rcx) - jz 
L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - -# endif -END (STRCAT) -#endif diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S deleted file mode 100644 index 6c45ff3ec7..0000000000 --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCAT -#define STRCAT __strncat_ssse3 -#include "strcat-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
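[Editorial note: nearly all of the 866 deleted lines above are instances of a single idiom — strcat-ssse3 scans 16 bytes at a time, using pcmpeqb against an all-zero register to flag NUL bytes and pmovmskb to collapse the 16 byte-lanes into a bitmask it can branch on. A small C sketch of that idiom for readers who don't speak SSE assembly; the function and variable names, and the __builtin_ctz bit scan, are mine, not glibc's:

#include <stdio.h>
#include <emmintrin.h>          /* SSE2: pcmpeqb/pmovmskb intrinsics */

/* Return the offset of the first NUL in the 16-byte block at P,
   or -1 if the block contains none.  P must be 16-byte aligned.  */
static int
first_nul_in_16 (const unsigned char *p)
{
  __m128i chunk = _mm_load_si128 ((const __m128i *) p);
  /* pcmpeqb: 0xff in every byte lane that equals zero.  */
  __m128i eq = _mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ());
  /* pmovmskb: one bit per lane; the lowest set bit is the first NUL.  */
  int mask = _mm_movemask_epi8 (eq);
  return mask ? __builtin_ctz (mask) : -1;
}

int
main (void)
{
  static _Alignas (16) unsigned char buf[16] = "hello";
  printf ("%d\n", first_nul_in_16 (buf));   /* prints 5 */
  return 0;
}

The deleted file unrolls this check across xmm0-xmm3 and a 64-byte aligned loop, then pairs it with a long jump table of byte-exact tail copies — which is the per-function code-size cost the commit message refers to.]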
* Re: [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-04-14 18:06 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-04-14 18:06 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > [... full patch quoted verbatim; trimmed ...] LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
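[Editorial note, as background for the ifunc-*.h hunks running through this series: each of these routines is exported as an ELF indirect function, so the IFUNC_SELECTOR bodies being edited are resolver code that runs once at relocation time and hands back the address of the chosen implementation. Roughly — and only roughly: glibc goes through its own macro layer and its own cpu_features structure, and every name below is hypothetical — the mechanism looks like this with the plain GCC attribute, with __builtin_cpu_supports ("sse2") standing in for the Fast_Unaligned_Load check that is now the last test before the SSE2 baseline:

#include <stdio.h>

/* Baseline implementation; stands in for __strcat_sse2.  */
static char *
my_strcat_sse2 (char *d, const char *s)
{
  char *r = d;
  while (*d)
    d++;
  while ((*d++ = *s++) != '\0')
    ;
  return r;
}

/* Stand-in for __strcat_sse2_unaligned; a real variant would differ.  */
static char *
my_strcat_sse2_unaligned (char *d, const char *s)
{
  return my_strcat_sse2 (d, s);
}

/* Resolver: the analogue of IFUNC_SELECTOR after this series.  The
   SSSE3 test is gone, so control falls straight through to the
   baseline.  */
static char *(*resolve_my_strcat (void)) (char *, const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("sse2"))  /* stand-in predicate */
    return my_strcat_sse2_unaligned;
  return my_strcat_sse2;
}

/* ELF ifunc: the dynamic loader calls the resolver and binds the
   PLT slot to whatever it returns.  GCC on ELF targets only.  */
char *my_strcat (char *, const char *)
  __attribute__ ((ifunc ("resolve_my_strcat")));

int
main (void)
{
  char buf[32] = "foo";
  puts (my_strcat (buf, "bar"));        /* foobar */
  return 0;
}

Seen through this lens, each patch in the series is the same one-line change to a resolver — delete the SSSE3 test — plus the deletion of the implementation that test used to return.]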
* [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-04-14 16:47 ` Noah Goldstein 2022-04-14 18:10 ` H.J. Lu 2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein ` (2 subsequent siblings) 5 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - 6 files changed, 3572 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 2b3c625ea2..5b02ec8de5 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -46,13 +46,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -83,7 +81,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -110,7 +107,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 41a04621ad..49ce6860d0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb (%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) 
- pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, 
%xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - 
movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - 
pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave7) -# 
endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps 
%xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx 
-# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 -L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - 
.p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps 
%xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp 
$15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 - jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax 
- mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - 
cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz 
L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps 
%xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - 
lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave12): - movaps %xmm2, %xmm3 - add 
$48, %r8 - jle L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - palignr $12, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit12): - lea 4(%rdx, %rsi), %rdx - lea 4(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave13): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - palignr $13, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit13): - lea 3(%rdx, %rsi), %rdx - lea 3(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave14): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - palignr $14, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit14): - lea 2(%rdx, %rsi), %rdx - lea 2(%rcx, %rsi), %rcx - movw -2(%rcx), %ax - xor %rsi, %rsi - movw %ax, -2(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave15): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - palignr $15, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit15): - lea 1(%rdx, %rsi), %rdx - lea 1(%rcx, %rsi), %rcx - movb -1(%rcx), %ah - xor %rsi, %rsi - movb %ah, -1(%rdx) - jmp L(CopyFrom1To16BytesCase3) - -# endif -# ifndef USE_AS_STRCAT -END (STRCPY) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S deleted file mode 100644 index bf82ee447d..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_ssse3 -#include "strcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein @ 2022-04-14 18:10 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-04-14 18:10 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - > sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - > sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - > sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- > sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - > 6 files changed, 3572 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 2b3c625ea2..5b02ec8de5 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -46,13 +46,11 @@ sysdep_routines += \ > stpcpy-evex \ > stpcpy-sse2 \ > stpcpy-sse2-unaligned \ > - stpcpy-ssse3 \ > stpncpy-avx2 \ > stpncpy-avx2-rtm \ > stpncpy-c \ > stpncpy-evex \ > stpncpy-sse2-unaligned \ > - stpncpy-ssse3 \ > strcasecmp_l-avx2 \ > strcasecmp_l-avx2-rtm \ > strcasecmp_l-evex \ > @@ -83,7 +81,6 @@ sysdep_routines += \ > strcpy-evex \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > - strcpy-ssse3 \ > strcspn-c \ > strcspn-sse2 \ > strlen-avx2 \ > @@ -110,7 +107,6 @@ sysdep_routines += \ > strncpy-c \ > strncpy-evex \ > strncpy-sse2-unaligned \ > - strncpy-ssse3 \ > strnlen-avx2 \ > strnlen-avx2-rtm \ > strnlen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 41a04621ad..49ce6860d0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ > IFUNC_IMPL (i, name, stpncpy, > - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), > - __stpncpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), > __stpncpy_avx2) > IFUNC_IMPL_ADD (array, i, stpncpy, > @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ > IFUNC_IMPL (i, name, stpcpy, > - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), > - __stpcpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), > __stpcpy_avx2) > IFUNC_IMPL_ADD (array, i, stpcpy, > @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strcpy_evex) > - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), > - __strcpy_ssse3) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) > > @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strncpy_evex) > - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), > - __strncpy_ssse3) > IFUNC_IMPL_ADD (array, i, strncpy, 1, > __strncpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) > diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > deleted file mode 100644 > index d971c2da38..0000000000 > --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STPCPY > -#define STRCPY __stpcpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > deleted file mode 100644 > index 14ed16f6b5..0000000000 > --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY __stpncpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S > deleted file mode 100644 > index f617a535cf..0000000000 > --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S > +++ /dev/null > @@ -1,3550 +0,0 @@ > -/* strcpy with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#if IS_IN (libc) > - > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > - > -# ifndef STRCPY > -# define STRCPY __strcpy_ssse3 > -# endif > - > - .section .text.ssse3,"ax",@progbits > -ENTRY (STRCPY) > - > - mov %rsi, %rcx > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > -# endif > - mov %rdi, %rdx > -# ifdef USE_AS_STRNCPY > - test %R8_LP, %R8_LP > - jz L(Exit0) > - cmp $8, %R8_LP > - jbe L(StrncpyExit8Bytes) > -# endif > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmpb $0, 7(%rcx) > - jz L(Exit8) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - jb L(StrncpyExit15Bytes) > -# endif > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmpb $0, 14(%rcx) > - jz L(Exit15) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - je L(Exit16) > -# endif > - cmpb $0, 15(%rcx) > - jz L(Exit16) > -# endif > - > -# ifdef USE_AS_STRNCPY > - mov %rcx, %rsi > - sub $16, %r8 > - and $0xf, %rsi > - > -/* add 16 bytes rcx_offset to r8 */ > - > - add %rsi, %r8 > -# endif > - lea 16(%rcx), %rsi > - and $-16, %rsi > - pxor %xmm0, %xmm0 > - mov (%rcx), %r9 > - mov %r9, (%rdx) > - pcmpeqb (%rsi), %xmm0 > - mov 8(%rcx), %r9 > - mov %r9, 8(%rdx) > - > -/* convert byte mask in xmm0 to bit mask */ > - > - pmovmskb %xmm0, %rax > - sub %rcx, %rsi > - > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - mov %rdx, %rax > - lea 16(%rdx), %rdx > - and $-16, %rdx > - sub %rdx, %rax > - > -# ifdef USE_AS_STRNCPY > - add %rax, %rsi > - lea -1(%rsi), %rsi > - and $1<<31, %esi > - test %rsi, %rsi > - jnz L(ContinueCopy) > - lea 16(%r8), %r8 > - > -L(ContinueCopy): > -# endif > - sub %rax, %rcx > - mov %rcx, %rax > - and $0xf, %rax > - mov $0, %rsi > - > -/* case: rcx_offset == rdx_offset */ > - > - jz L(Align16Both) > - > - cmp $8, %rax > - jae L(ShlHigh8) > - cmp $1, %rax > - je L(Shl1) > - cmp $2, %rax > - je L(Shl2) > - cmp $3, %rax > - je L(Shl3) > - cmp $4, %rax > - je L(Shl4) > - cmp $5, %rax > - je L(Shl5) > - cmp $6, %rax > - je L(Shl6) > - jmp L(Shl7) > - > -L(ShlHigh8): > - je L(Shl8) > - cmp $9, %rax > - je L(Shl9) > - cmp $10, %rax > - je L(Shl10) > - cmp $11, %rax > - je L(Shl11) > - cmp $12, %rax > - je L(Shl12) > - cmp $13, %rax > - je L(Shl13) > - cmp $14, %rax > - je L(Shl14) > - jmp L(Shl15) > - > -L(Align16Both): > - movaps (%rcx), %xmm1 > - movaps 16(%rcx), %xmm2 > - movaps %xmm1, (%rdx) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm4 > - movaps %xmm3, (%rdx, %rsi) > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# 
endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm1 > - movaps %xmm4, (%rdx, %rsi) > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm2 > - movaps %xmm1, (%rdx, %rsi) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm3, (%rdx, %rsi) > - mov %rcx, %rax > - lea 16(%rcx, %rsi), %rcx > - and $-0x40, %rcx > - sub %rcx, %rax > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - lea 112(%r8, %rax), %r8 > -# endif > - mov $-0x40, %rsi > - > - .p2align 4 > -L(Aligned64Loop): > - movaps (%rcx), %xmm2 > - movaps %xmm2, %xmm4 > - movaps 16(%rcx), %xmm5 > - movaps 32(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 48(%rcx), %xmm7 > - pminub %xmm5, %xmm2 > - pminub %xmm7, %xmm3 > - pminub %xmm2, %xmm3 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %rax > - lea 64(%rdx), %rdx > - lea 64(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeaveCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Aligned64Leave) > - movaps %xmm4, -64(%rdx) > - movaps %xmm5, -48(%rdx) > - movaps %xmm6, -32(%rdx) > - movaps %xmm7, -16(%rdx) > - jmp L(Aligned64Loop) > - > -L(Aligned64Leave): > -# ifdef USE_AS_STRNCPY > - lea 48(%r8), %r8 > -# endif > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm6, -32(%rdx) > - pcmpeqb %xmm7, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl1): > - movaps -1(%rcx), %xmm1 > - movaps 15(%rcx), %xmm2 > -L(Shl1Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, 
%rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 31(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -15(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -1(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl1LoopStart): > - movaps 15(%rcx), %xmm2 > - movaps 31(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 47(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 63(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $1, %xmm4, %xmm5 > - test %rax, %rax > - palignr $1, %xmm3, %xmm4 > - jnz L(Shl1Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave1) > -# endif > - palignr $1, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $1, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl1LoopStart) > - > -L(Shl1LoopExit): > - movdqu -1(%rcx), %xmm1 > - mov $15, %rsi > - movdqu %xmm1, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl2): > - movaps -2(%rcx), %xmm1 > - movaps 14(%rcx), %xmm2 > -L(Shl2Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 30(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -14(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -2(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl2LoopStart): > - movaps 14(%rcx), %xmm2 > - movaps 30(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 46(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 62(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr 
$2, %xmm4, %xmm5 > - test %rax, %rax > - palignr $2, %xmm3, %xmm4 > - jnz L(Shl2Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave2) > -# endif > - palignr $2, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $2, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl2LoopStart) > - > -L(Shl2LoopExit): > - movdqu -2(%rcx), %xmm1 > - mov $14, %rsi > - movdqu %xmm1, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl3): > - movaps -3(%rcx), %xmm1 > - movaps 13(%rcx), %xmm2 > -L(Shl3Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 29(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -13(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -3(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl3LoopStart): > - movaps 13(%rcx), %xmm2 > - movaps 29(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 45(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 61(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $3, %xmm4, %xmm5 > - test %rax, %rax > - palignr $3, %xmm3, %xmm4 > - jnz L(Shl3Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave3) > -# endif > - palignr $3, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $3, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl3LoopStart) > - > -L(Shl3LoopExit): > - movdqu -3(%rcx), %xmm1 > - mov $13, %rsi > - movdqu %xmm1, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl4): > - movaps -4(%rcx), %xmm1 > - movaps 12(%rcx), %xmm2 > -L(Shl4Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, 
%rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 28(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -12(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -4(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl4LoopStart): > - movaps 12(%rcx), %xmm2 > - movaps 28(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 44(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 60(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $4, %xmm4, %xmm5 > - test %rax, %rax > - palignr $4, %xmm3, %xmm4 > - jnz L(Shl4Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave4) > -# endif > - palignr $4, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $4, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl4LoopStart) > - > -L(Shl4LoopExit): > - movdqu -4(%rcx), %xmm1 > - mov $12, %rsi > - movdqu %xmm1, -4(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl5): > - movaps -5(%rcx), %xmm1 > - movaps 11(%rcx), %xmm2 > -L(Shl5Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 27(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - 
lea -11(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -5(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl5LoopStart): > - movaps 11(%rcx), %xmm2 > - movaps 27(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 43(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 59(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $5, %xmm4, %xmm5 > - test %rax, %rax > - palignr $5, %xmm3, %xmm4 > - jnz L(Shl5Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave5) > -# endif > - palignr $5, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $5, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl5LoopStart) > - > -L(Shl5LoopExit): > - movdqu -5(%rcx), %xmm1 > - mov $11, %rsi > - movdqu %xmm1, -5(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl6): > - movaps -6(%rcx), %xmm1 > - movaps 10(%rcx), %xmm2 > -L(Shl6Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 26(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -10(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -6(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl6LoopStart): > - movaps 10(%rcx), %xmm2 > - movaps 26(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 42(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 58(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $6, %xmm4, %xmm5 > - test %rax, %rax > - palignr $6, %xmm3, %xmm4 > - jnz L(Shl6Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave6) > -# endif > - palignr $6, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $6, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl6LoopStart) > - > -L(Shl6LoopExit): > - mov (%rcx), %r9 > - mov 6(%rcx), %esi > - mov %r9, (%rdx) > - mov 
%esi, 6(%rdx) > - mov $10, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl7): > - movaps -7(%rcx), %xmm1 > - movaps 9(%rcx), %xmm2 > -L(Shl7Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 25(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -9(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -7(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl7LoopStart): > - movaps 9(%rcx), %xmm2 > - movaps 25(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 41(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 57(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $7, %xmm4, %xmm5 > - test %rax, %rax > - palignr $7, %xmm3, %xmm4 > - jnz L(Shl7Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave7) > -# endif > - palignr $7, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $7, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl7LoopStart) > - > -L(Shl7LoopExit): > - mov (%rcx), %r9 > - mov 5(%rcx), %esi > - mov %r9, (%rdx) > - mov %esi, 5(%rdx) > - mov $9, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl8): > - movaps -8(%rcx), %xmm1 > - movaps 8(%rcx), %xmm2 > -L(Shl8Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe 
L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 24(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -8(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -8(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl8LoopStart): > - movaps 8(%rcx), %xmm2 > - movaps 24(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 40(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 56(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $8, %xmm4, %xmm5 > - test %rax, %rax > - palignr $8, %xmm3, %xmm4 > - jnz L(Shl8Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave8) > -# endif > - palignr $8, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $8, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl8LoopStart) > - > -L(Shl8LoopExit): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl9): > - movaps -9(%rcx), %xmm1 > - movaps 7(%rcx), %xmm2 > -L(Shl9Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 23(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -7(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -9(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl9LoopStart): > - movaps 7(%rcx), %xmm2 > - movaps 23(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 39(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 55(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - 
movaps %xmm5, %xmm7 > - palignr $9, %xmm4, %xmm5 > - test %rax, %rax > - palignr $9, %xmm3, %xmm4 > - jnz L(Shl9Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave9) > -# endif > - palignr $9, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $9, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl9LoopStart) > - > -L(Shl9LoopExit): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl10): > - movaps -10(%rcx), %xmm1 > - movaps 6(%rcx), %xmm2 > -L(Shl10Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 22(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -6(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -10(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl10LoopStart): > - movaps 6(%rcx), %xmm2 > - movaps 22(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 38(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 54(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $10, %xmm4, %xmm5 > - test %rax, %rax > - palignr $10, %xmm3, %xmm4 > - jnz L(Shl10Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave10) > -# endif > - palignr $10, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $10, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl10LoopStart) > - > -L(Shl10LoopExit): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl11): > - movaps -11(%rcx), %xmm1 > - movaps 5(%rcx), %xmm2 > -L(Shl11Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 
16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 21(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -5(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -11(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl11LoopStart): > - movaps 5(%rcx), %xmm2 > - movaps 21(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 37(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 53(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $11, %xmm4, %xmm5 > - test %rax, %rax > - palignr $11, %xmm3, %xmm4 > - jnz L(Shl11Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave11) > -# endif > - palignr $11, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $11, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl11LoopStart) > - > -L(Shl11LoopExit): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl12): > - movaps -12(%rcx), %xmm1 > - movaps 4(%rcx), %xmm2 > -L(Shl12Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 20(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov 
%rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -4(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -12(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl12LoopStart): > - movaps 4(%rcx), %xmm2 > - movaps 20(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 36(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 52(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $12, %xmm4, %xmm5 > - test %rax, %rax > - palignr $12, %xmm3, %xmm4 > - jnz L(Shl12Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave12) > -# endif > - palignr $12, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $12, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl12LoopStart) > - > -L(Shl12LoopExit): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl13): > - movaps -13(%rcx), %xmm1 > - movaps 3(%rcx), %xmm2 > -L(Shl13Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 19(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -3(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -13(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl13LoopStart): > - movaps 3(%rcx), %xmm2 > - movaps 19(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 35(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 51(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $13, %xmm4, %xmm5 > - test %rax, %rax > - palignr $13, %xmm3, %xmm4 > - jnz L(Shl13Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave13) > -# endif > - palignr $13, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $13, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl13LoopStart) > - > -L(Shl13LoopExit): 
> - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl14): > - movaps -14(%rcx), %xmm1 > - movaps 2(%rcx), %xmm2 > -L(Shl14Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 18(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -2(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -14(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl14LoopStart): > - movaps 2(%rcx), %xmm2 > - movaps 18(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 34(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 50(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $14, %xmm4, %xmm5 > - test %rax, %rax > - palignr $14, %xmm3, %xmm4 > - jnz L(Shl14Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave14) > -# endif > - palignr $14, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $14, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl14LoopStart) > - > -L(Shl14LoopExit): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl15): > - movaps -15(%rcx), %xmm1 > - movaps 1(%rcx), %xmm2 > -L(Shl15Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 
> - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 17(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -1(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -15(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl15LoopStart): > - movaps 1(%rcx), %xmm2 > - movaps 17(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 33(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 49(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $15, %xmm4, %xmm5 > - test %rax, %rax > - palignr $15, %xmm3, %xmm4 > - jnz L(Shl15Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave15) > -# endif > - palignr $15, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $15, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl15LoopStart) > - > -L(Shl15LoopExit): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > -# ifdef USE_AS_STRCAT > - jmp L(CopyFrom1To16Bytes) > -# endif > - > -# ifndef USE_AS_STRCAT > - > - .p2align 4 > -L(CopyFrom1To16Bytes): > -# ifdef USE_AS_STRNCPY > - add $16, %r8 > -# endif > - add %rsi, %rdx > - add %rsi, %rcx > - > - test %al, %al > - jz L(ExitHigh) > - test $0x01, %al > - jnz L(Exit1) > - test $0x02, %al > - jnz L(Exit2) > - test $0x04, %al > - jnz L(Exit3) > - test $0x08, %al > - jnz L(Exit4) > - test $0x10, %al > - jnz L(Exit5) > - test $0x20, %al > - jnz L(Exit6) > - test $0x40, %al > - jnz L(Exit7) > - > - .p2align 4 > -L(Exit8): > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $8, %r8 > - lea 8(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(ExitHigh): > - test $0x01, %ah > - jnz L(Exit9) > - test $0x02, %ah > - jnz L(Exit10) > - test $0x04, %ah > - jnz L(Exit11) > - test $0x08, %ah > - jnz L(Exit12) > - test $0x10, %ah > - jnz L(Exit13) > - test $0x20, %ah > - jnz L(Exit14) > - test $0x40, %ah > - jnz L(Exit15) > - > - .p2align 4 > -L(Exit16): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %rax > - mov %rax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 15(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - lea 16(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - > - .p2align 4 > -L(CopyFrom1To16BytesCase2): > - add $16, %r8 > - add %rsi, %rcx > - lea (%rsi, %rdx), %rsi > - lea -9(%r8), %rdx > - and $1<<7, %dh > - or %al, %dh > - test %dh, %dh > - lea (%rsi), %rdx > - jz L(ExitHighCase2) > - > - cmp $1, %r8 > - je L(Exit1) > - test $0x01, %al > - jnz L(Exit1) > - cmp $2, %r8 > - je 
L(Exit2) > - test $0x02, %al > - jnz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - test $0x04, %al > - jnz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - test $0x08, %al > - jnz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - test $0x10, %al > - jnz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - test $0x20, %al > - jnz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - test $0x40, %al > - jnz L(Exit7) > - jmp L(Exit8) > - > - .p2align 4 > -L(ExitHighCase2): > - cmp $9, %r8 > - je L(Exit9) > - test $0x01, %ah > - jnz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - test $0x02, %ah > - jnz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - test $0x04, %ah > - jnz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - test $0x8, %ah > - jnz L(Exit12) > - cmp $13, %r8 > - je L(Exit13) > - test $0x10, %ah > - jnz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - test $0x20, %ah > - jnz L(Exit14) > - cmp $15, %r8 > - je L(Exit15) > - test $0x40, %ah > - jnz L(Exit15) > - jmp L(Exit16) > - > -L(CopyFrom1To16BytesCase2OrCase3): > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - > - .p2align 4 > -L(CopyFrom1To16BytesCase3): > - add $16, %r8 > - add %rsi, %rdx > - add %rsi, %rcx > - > - cmp $16, %r8 > - je L(Exit16) > - cmp $8, %r8 > - je L(Exit8) > - jg L(More8Case3) > - cmp $4, %r8 > - je L(Exit4) > - jg L(More4Case3) > - cmp $2, %r8 > - jl L(Exit1) > - je L(Exit2) > - jg L(Exit3) > -L(More8Case3): /* but less than 16 */ > - cmp $12, %r8 > - je L(Exit12) > - jl L(Less12Case3) > - cmp $14, %r8 > - jl L(Exit13) > - je L(Exit14) > - jg L(Exit15) > -L(More4Case3): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Exit5) > - je L(Exit6) > - jg L(Exit7) > -L(Less12Case3): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Exit9) > - je L(Exit10) > - jg L(Exit11) > -# endif > - > - .p2align 4 > -L(Exit1): > - movb (%rcx), %al > - movb %al, (%rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $1, %r8 > - lea 1(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit2): > - movw (%rcx), %ax > - movw %ax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 1(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $2, %r8 > - lea 2(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit3): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - movb 2(%rcx), %al > - movb %al, 2(%rdx) > -# ifdef USE_AS_STPCPY > - lea 2(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $3, %r8 > - lea 3(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit4): > - movl (%rcx), %eax > - movl %eax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 3(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $4, %r8 > - lea 4(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit5): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movb 4(%rcx), %al > - movb %al, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 4(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $5, %r8 > - lea 5(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef 
USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit6): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movw 4(%rcx), %ax > - movw %ax, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 5(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $6, %r8 > - lea 6(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit7): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movl 3(%rcx), %eax > - movl %eax, 3(%rdx) > -# ifdef USE_AS_STPCPY > - lea 6(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $7, %r8 > - lea 7(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit9): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %eax > - mov %eax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 8(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $9, %r8 > - lea 9(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit10): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %eax > - mov %eax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 9(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $10, %r8 > - lea 10(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit11): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %eax > - mov %eax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 10(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $11, %r8 > - lea 11(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit12): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %eax > - mov %eax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 11(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $12, %r8 > - lea 12(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit13): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %rax > - mov %rax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 12(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $13, %r8 > - lea 13(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit14): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %rax > - mov %rax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 13(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $14, %r8 > - lea 14(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit15): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $15, %r8 > - lea 
15(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(Fill0): > - ret > - > - .p2align 4 > -L(Fill1): > - movb %dl, (%rcx) > - ret > - > - .p2align 4 > -L(Fill2): > - movw %dx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill3): > - movw %dx, (%rcx) > - movb %dl, 2(%rcx) > - ret > - > - .p2align 4 > -L(Fill4): > - movl %edx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill5): > - movl %edx, (%rcx) > - movb %dl, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill6): > - movl %edx, (%rcx) > - movw %dx, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill7): > - movl %edx, (%rcx) > - movl %edx, 3(%rcx) > - ret > - > - .p2align 4 > -L(Fill8): > - mov %rdx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill9): > - mov %rdx, (%rcx) > - movb %dl, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill10): > - mov %rdx, (%rcx) > - movw %dx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill11): > - mov %rdx, (%rcx) > - movl %edx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill12): > - mov %rdx, (%rcx) > - movl %edx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill13): > - mov %rdx, (%rcx) > - mov %rdx, 5(%rcx) > - ret > - > - .p2align 4 > -L(Fill14): > - mov %rdx, (%rcx) > - mov %rdx, 6(%rcx) > - ret > - > - .p2align 4 > -L(Fill15): > - mov %rdx, (%rcx) > - mov %rdx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill16): > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - ret > - > - .p2align 4 > -L(StrncpyFillExit1): > - lea 16(%r8), %r8 > -L(FillFrom1To16Bytes): > - test %r8, %r8 > - jz L(Fill0) > - cmp $16, %r8 > - je L(Fill16) > - cmp $8, %r8 > - je L(Fill8) > - jg L(FillMore8) > - cmp $4, %r8 > - je L(Fill4) > - jg L(FillMore4) > - cmp $2, %r8 > - jl L(Fill1) > - je L(Fill2) > - jg L(Fill3) > -L(FillMore8): /* but less than 16 */ > - cmp $12, %r8 > - je L(Fill12) > - jl L(FillLess12) > - cmp $14, %r8 > - jl L(Fill13) > - je L(Fill14) > - jg L(Fill15) > -L(FillMore4): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Fill5) > - je L(Fill6) > - jg L(Fill7) > -L(FillLess12): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Fill9) > - je L(Fill10) > - jmp L(Fill11) > - > - .p2align 4 > -L(StrncpyFillTailWithZero1): > - xor %rdx, %rdx > - sub $16, %r8 > - jbe L(StrncpyFillExit1) > - > - pxor %xmm0, %xmm0 > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - > - lea 16(%rcx), %rcx > - > - mov %rcx, %rdx > - and $0xf, %rdx > - sub %rdx, %rcx > - add %rdx, %r8 > - xor %rdx, %rdx > - sub $64, %r8 > - jb L(StrncpyFillLess64) > - > -L(StrncpyFillLoopMovdqa): > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - movdqa %xmm0, 32(%rcx) > - movdqa %xmm0, 48(%rcx) > - lea 64(%rcx), %rcx > - sub $64, %r8 > - jae L(StrncpyFillLoopMovdqa) > - > -L(StrncpyFillLess64): > - add $32, %r8 > - jl L(StrncpyFillLess32) > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - lea 32(%rcx), %rcx > - sub $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > -L(StrncpyFillLess32): > - add $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > - .p2align 4 > -L(Exit0): > - mov %rdx, %rax > - ret > - > - .p2align 4 > -L(StrncpyExit15Bytes): > - cmp $9, %r8 > - je L(Exit9) > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmp 
$13, %r8 > - je L(Exit13) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > - .p2align 4 > -L(StrncpyExit8Bytes): > - cmp $1, %r8 > - je L(Exit1) > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmp $2, %r8 > - je L(Exit2) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > -# endif > -# endif > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(StrncpyLeaveCase2OrCase3): > - test %rax, %rax > - jnz L(Aligned64LeaveCase2) > - > -L(Aligned64LeaveCase3): > - lea 64(%r8), %r8 > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase3) > - > -L(Aligned64LeaveCase2): > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - add $48, %r8 > - jle L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase2) > -/*--------------------------------------------------*/ > - .p2align 4 > -L(StrncpyExit1Case2OrCase3): > - movdqu -1(%rcx), %xmm0 > - movdqu %xmm0, -1(%rdx) > - mov $15, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit2Case2OrCase3): > - movdqu -2(%rcx), %xmm0 > - movdqu %xmm0, -2(%rdx) > - mov $14, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit3Case2OrCase3): > - movdqu -3(%rcx), %xmm0 > - movdqu %xmm0, -3(%rdx) > - mov $13, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit4Case2OrCase3): > - movdqu -4(%rcx), %xmm0 > - movdqu %xmm0, -4(%rdx) > - mov $12, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit5Case2OrCase3): > - movdqu -5(%rcx), %xmm0 > - movdqu %xmm0, -5(%rdx) > - mov $11, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > 
-L(StrncpyExit6Case2OrCase3): > - mov (%rcx), %rsi > - mov 6(%rcx), %r9d > - mov %r9d, 6(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $10, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit7Case2OrCase3): > - mov (%rcx), %rsi > - mov 5(%rcx), %r9d > - mov %r9d, 5(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $9, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit8Case2OrCase3): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit9Case2OrCase3): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit10Case2OrCase3): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit11Case2OrCase3): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit12Case2OrCase3): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit13Case2OrCase3): > - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit14Case2OrCase3): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit15Case2OrCase3): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave1): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit1) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit1): > - lea 15(%rdx, %rsi), %rdx > - lea 15(%rcx, %rsi), %rcx > - mov -15(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -15(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave2): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit2) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit2): > - lea 14(%rdx, %rsi), %rdx > - lea 14(%rcx, %rsi), %rcx > - mov -14(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -14(%rdx) > - mov %rax, 
-8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave3): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit3) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit3): > - lea 13(%rdx, %rsi), %rdx > - lea 13(%rcx, %rsi), %rcx > - mov -13(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -13(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave4): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit4) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit4): > - lea 12(%rdx, %rsi), %rdx > - lea 12(%rcx, %rsi), %rcx > - mov -12(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -12(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave5): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit5) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit5): > - lea 11(%rdx, %rsi), %rdx > - lea 11(%rcx, %rsi), %rcx > - mov -11(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -11(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave6): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit6) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit6): > - lea 10(%rdx, %rsi), %rdx > - lea 10(%rcx, %rsi), %rcx > - mov -10(%rcx), %rsi > - movw -2(%rcx), %ax > - mov %rsi, -10(%rdx) > - movw %ax, -2(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave7): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit7) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > 
- jbe L(StrncpyExit7) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit7): > - lea 9(%rdx, %rsi), %rdx > - lea 9(%rcx, %rsi), %rcx > - mov -9(%rcx), %rsi > - movb -1(%rcx), %ah > - mov %rsi, -9(%rdx) > - movb %ah, -1(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave8): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit8) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit8): > - lea 8(%rdx, %rsi), %rdx > - lea 8(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave9): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit9) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit9): > - lea 7(%rdx, %rsi), %rdx > - lea 7(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave10): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit10) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit10): > - lea 6(%rdx, %rsi), %rdx > - lea 6(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave11): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit11) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit11): > - lea 5(%rdx, %rsi), %rdx > - lea 5(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave12): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit12) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe 
L(StrncpyExit12) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit12): > - lea 4(%rdx, %rsi), %rdx > - lea 4(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave13): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit13) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit13): > - lea 3(%rdx, %rsi), %rdx > - lea 3(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave14): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit14) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit14): > - lea 2(%rdx, %rsi), %rdx > - lea 2(%rcx, %rsi), %rcx > - movw -2(%rcx), %ax > - xor %rsi, %rsi > - movw %ax, -2(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave15): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit15) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit15): > - lea 1(%rdx, %rsi), %rdx > - lea 1(%rcx, %rsi), %rcx > - movb -1(%rcx), %ah > - xor %rsi, %rsi > - movb %ah, -1(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > -# endif > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# endif > -#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S > deleted file mode 100644 > index bf82ee447d..0000000000 > --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STRNCPY > -#define STRCPY __strncpy_ssse3 > -#include "strcpy-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
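The strcpy/strncpy loops deleted above are all instances of one SSSE3 idiom: copy from a misaligned source using only 16-byte aligned loads, then splice adjacent chunks together with palignr. Below is a minimal C sketch of that idiom using the _mm_alignr_epi8 intrinsic (the C spelling of palignr). It is illustrative only, not the glibc code: the function name, the fixed shift of 1, and the precomputed block count are assumptions for the sketch, and the real routines additionally handle the string terminator and (for strncpy) the length bound.

#include <stddef.h>
#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Illustrative sketch only.  Copies n16 16-byte blocks to a 16-byte
   aligned dst from a src that is exactly 1 byte past 16-byte
   alignment; the deleted assembly handles every shift 1..15.  */
static void
copy_shl1 (char *dst, const char *src, size_t n16)
{
  /* Round src down to alignment.  Reading the byte before src stays
     inside the same aligned 16-byte block, so it cannot fault.  */
  const __m128i *s = (const __m128i *) (src - 1);
  __m128i prev = _mm_load_si128 (s++);          /* src[-1..14] */
  while (n16--)
    {
      __m128i cur = _mm_load_si128 (s++);       /* next aligned chunk */
      /* Concatenate cur:prev and shift right by one byte, yielding
         src[0..15] without ever issuing an unaligned load.  */
      _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (cur, prev, 1));
      dst += 16;
      prev = cur;
    }
}

Because the palignr immediate must be a compile-time constant, the assembly needs a separately unrolled loop for each of the fifteen possible shifts (L(Shl15LoopStart) and the L(StrncpyLeave1) through L(StrncpyLeave15) tails above), which is precisely the code-size cost this series cites for retiring the SSSE3 variants.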
* [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (2 preceding siblings ...) 2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein @ 2022-04-14 16:47 ` Noah Goldstein 2022-04-14 18:13 ` H.J. Lu 2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein 2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu 5 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +- sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - 5 files changed, 6 insertions(+), 3212 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 5b02ec8de5..303fb5d734 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -17,7 +17,6 @@ sysdep_routines += \ memcmpeq-evex \ memcmpeq-sse2 \ memcpy-ssse3 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ @@ -25,7 +24,6 @@ sysdep_routines += \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ memmove-ssse3 \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 49ce6860d0..c6008a73ed 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3) @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3) @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), 
__memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3) @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..fb01fbb301 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + return OPTIMIZE (ssse3); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), 
%xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, -0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 
0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, 
%xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), 
%xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - 
movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) -L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - 
movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr 
$11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, 
-0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), 
%xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx -L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy_fwd): - sub $0x80, %rdx -L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx -L(ll_cache_copy_fwd): - add %rcx, %rdx -L(ll_cache_copy_fwd_start): - sub $0x80, %rdx -L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) -L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp 
%rcx, %r9 - jbe L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx -L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy): - sub $0x80, %rdx -L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx -L(ll_cache_copy): - add %rcx, %rdx -L(ll_cache_copy_bwd_start): - sub $0x80, %rdx -L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) -L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -L(fwd_write_0bytes): - ret - - - .p2align 4 -L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu 
%xmm0, -126(%rdi) -L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -L(fwd_write_42bytes): - 
lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) -L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, 
-133(%rdi) -L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -L(fwd_write_21bytes): - lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - movdqu 
%xmm0, -33(%rdi) -L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) -L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -L(bwd_write_0bytes): - ret - - .p2align 4 -L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - - .p2align 4 -L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) 
-L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 123(%rdi) -L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -L(bwd_write_119bytes): - lddqu 
103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret 
- - .p2align 4 -L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - movdqu %xmm0, 50(%rdi) -L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int 
JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL 
(L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - - .p2align 3 -L(table_144_bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) - .int 
JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_60bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) - .int 
JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - - .p2align 3 -L(shl_table_fwd): - .int JMPTBL (L(shl_0), L(shl_table_fwd)) - .int JMPTBL (L(shl_1), L(shl_table_fwd)) - .int JMPTBL (L(shl_2), L(shl_table_fwd)) - .int JMPTBL (L(shl_3), L(shl_table_fwd)) - .int JMPTBL (L(shl_4), L(shl_table_fwd)) - .int JMPTBL (L(shl_5), L(shl_table_fwd)) - .int JMPTBL (L(shl_6), L(shl_table_fwd)) - .int JMPTBL (L(shl_7), L(shl_table_fwd)) - .int JMPTBL (L(shl_8), L(shl_table_fwd)) - .int JMPTBL (L(shl_9), L(shl_table_fwd)) - .int JMPTBL (L(shl_10), L(shl_table_fwd)) - .int JMPTBL (L(shl_11), L(shl_table_fwd)) - .int JMPTBL (L(shl_12), L(shl_table_fwd)) - .int JMPTBL (L(shl_13), L(shl_table_fwd)) - .int JMPTBL (L(shl_14), L(shl_table_fwd)) - .int JMPTBL (L(shl_15), L(shl_table_fwd)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S deleted file mode 100644 index f9a4e9aff9..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_back -#define MEMCPY_CHK __memmove_chk_ssse3_back -#include "memcpy-ssse3-back.S" -- 2.25.1 ^ permalink raw reply 
[flat|nested] 49+ messages in thread
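For context on the code being deleted above: the bulk of memcpy-ssse3-back.S is fifteen nearly identical L(shl_N) loops. Each handles a source that is N bytes past a 16-byte boundary by doing only aligned 16-byte loads and stitching neighboring chunks together with palignr before storing to the (already aligned) destination. Below is a minimal C sketch of one such loop, mirroring L(shl_12); the function name copy_shl12 and its preconditions are illustrative only, not glibc code, and it must be compiled with -mssse3.

#include <stddef.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* Illustrative only: copy LEN bytes (a multiple of 16) to a 16-byte
   aligned DST from a SRC that sits 12 bytes past a 16-byte boundary,
   using nothing but aligned loads, as the removed L(shl_12) loop does.  */
static void
copy_shl12 (char *dst, const char *src, size_t len)
{
  const __m128i *s = (const __m128i *) (src - 12);   /* aligned base */
  __m128i prev = _mm_load_si128 (s++);               /* src[-12] .. src[3] */
  for (size_t i = 0; i < len; i += 16)
    {
      __m128i next = _mm_load_si128 (s++);           /* src[i+4] .. src[i+19] */
      /* Concatenate NEXT:PREV and shift right 12 bytes, which yields
         exactly src[i] .. src[i+15].  */
      _mm_store_si128 ((__m128i *) (dst + i),
                       _mm_alignr_epi8 (next, prev, 12));
      prev = next;
    }
}

Because palignr takes its byte count as an immediate, the shift cannot be a runtime variable; that is why the file carries a separate routine per shift value and dispatches through the L(shl_table_fwd)/L(shl_table_bwd) tables of relative offsets (the JMPTBL/BRANCH_TO_JMPTBL_ENTRY machinery quoted further down).

Copies larger than the cache-size threshold instead take the L(gobble_mem_fwd)/L(gobble_mem_bwd) paths, which prefetch ahead and use movntdq non-temporal stores so a huge copy does not evict the working set. A hedged sketch of that idea follows, with the threshold checks, prefetching, and alignment fixups omitted; the name copy_stream is mine:

#include <stddef.h>
#include <emmintrin.h>   /* SSE2: _mm_stream_si128, _mm_sfence */

/* Illustrative only: stream LEN bytes (a multiple of 16) to a 16-byte
   aligned DST with non-temporal stores, as the removed L(gobble_mem_*)
   loops do for copies past the shared-cache threshold.  */
static void
copy_stream (char *dst, const char *src, size_t len)
{
  for (size_t i = 0; i < len; i += 16)
    _mm_stream_si128 ((__m128i *) (dst + i),         /* movntdq */
                      _mm_loadu_si128 ((const __m128i *) (src + i)));
  /* Non-temporal stores are weakly ordered; fence before returning,
     matching the sfence right after L(gobble_mem_fwd_loop).  */
  _mm_sfence ();
}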
* Re: [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back 2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-04-14 18:13 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-04-14 18:13 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - > sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +- > sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- > sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - > 5 files changed, 6 insertions(+), 3212 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 5b02ec8de5..303fb5d734 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -17,7 +17,6 @@ sysdep_routines += \ > memcmpeq-evex \ > memcmpeq-sse2 \ > memcpy-ssse3 \ > - memcpy-ssse3-back \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > memmove-avx512-no-vzeroupper \ > @@ -25,7 +24,6 @@ sysdep_routines += \ > memmove-evex-unaligned-erms \ > memmove-sse2-unaligned-erms \ > memmove-ssse3 \ > - memmove-ssse3-back \ > memrchr-avx2 \ > memrchr-avx2-rtm \ > memrchr-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 49ce6860d0..c6008a73ed 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memmove_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memmove_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (SSSE3), > __memmove_chk_ssse3) > @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memmove, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_avx512_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > - __memmove_ssse3_back) > IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > __memmove_ssse3) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) > @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __memcpy_chk_ssse3) > @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memcpy, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > - __memcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > __memcpy_ssse3) > IFUNC_IMPL_ADD 
(array, i, memcpy, > @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __mempcpy_chk_ssse3) > @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, mempcpy, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > __mempcpy_ssse3) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h > index f8f958064c..fb01fbb301 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h > @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) > attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) > attribute_hidden; > @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void) > } > } > > - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > - return OPTIMIZE (sse2_unaligned_erms); > - > - return OPTIMIZE (sse2_unaligned); > + return OPTIMIZE (ssse3); > } > > - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) > - return OPTIMIZE (ssse3_back); > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE (sse2_unaligned_erms); > > - return OPTIMIZE (ssse3); > + return OPTIMIZE (sse2_unaligned); > } > diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > deleted file mode 100644 > index 92cfbf7933..0000000000 > --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > +++ /dev/null > @@ -1,3181 +0,0 @@ > -/* memcpy with SSSE3 and REP string > - Copyright (C) 2010-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#if IS_IN (libc) > - > -#include "asm-syntax.h" > - > -#ifndef MEMCPY > -# define MEMCPY __memcpy_ssse3_back > -# define MEMCPY_CHK __memcpy_chk_ssse3_back > -# define MEMPCPY __mempcpy_ssse3_back > -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back > -#endif > - > -#define JMPTBL(I, B) I - B > - > -/* Branch to an entry in a jump table. TABLE is a jump table with > - relative offsets. INDEX is a register contains the index into the > - jump table. SCALE is the scale of INDEX. */ > -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), INDEX; \ > - lea (%r11, INDEX), INDEX; \ > - _CET_NOTRACK jmp *INDEX; \ > - ud2 > - > - .section .text.ssse3,"ax",@progbits > -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE > -ENTRY (MEMPCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMPCPY_CHK) > - > -ENTRY (MEMPCPY) > - mov %RDI_LP, %RAX_LP > - add %RDX_LP, %RAX_LP > - jmp L(start) > -END (MEMPCPY) > -#endif > - > -#if !defined USE_AS_BCOPY > -ENTRY (MEMCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMCPY_CHK) > -#endif > - > -ENTRY (MEMCPY) > - mov %RDI_LP, %RAX_LP > -#ifdef USE_AS_MEMPCPY > - add %RDX_LP, %RAX_LP > -#endif > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -#endif > - > -#ifdef USE_AS_MEMMOVE > - cmp %rsi, %rdi > - jb L(copy_forward) > - je L(bwd_write_0bytes) > - cmp $144, %rdx > - jae L(copy_backward) > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -L(copy_forward): > -#endif > -L(start): > - cmp $144, %rdx > - jae L(144bytesormore) > - > -L(fwd_write_less32bytes): > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jbe L(bk_write) > -#endif > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > -#ifndef USE_AS_MEMMOVE > -L(bk_write): > - > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -#endif > - > - .p2align 4 > -L(144bytesormore): > - > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jle L(copy_backward) > -#endif > - movdqu (%rsi), %xmm0 > - mov %rdi, %r8 > - and $-16, %rdi > - add $16, %rdi > - mov %rdi, %r9 > - sub %r8, %r9 > - sub %r9, %rdx > - add %r9, %rsi > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0) > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - cmp %rcx, %rdx > - jae L(gobble_mem_fwd) > - lea L(shl_table_fwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(copy_backward): > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - shl $1, %rcx > - cmp %rcx, %rdx > - ja L(gobble_mem_bwd) > - > - add %rdx, %rdi > - add %rdx, %rsi > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $0xf, %r9 > - xor %r9, %rdi > - sub %r9, %rsi > - sub %r9, %rdx > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0_bwd) > - lea L(shl_table_bwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(shl_0): > - > - mov %rdx, %r9 > - shr $8, %r9 > - add %rdx, %r9 > -#ifdef DATA_CACHE_SIZE > - cmp $DATA_CACHE_SIZE_HALF, %R9_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %R9_LP > -#endif > - jae L(gobble_mem_fwd) > - sub $0x80, %rdx > - .p2align 4 > -L(shl_0_loop): > - movdqa 
(%rsi), %xmm1 > - movdqa %xmm1, (%rdi) > - movaps 0x10(%rsi), %xmm2 > - movaps %xmm2, 0x10(%rdi) > - movaps 0x20(%rsi), %xmm3 > - movaps %xmm3, 0x20(%rdi) > - movaps 0x30(%rsi), %xmm4 > - movaps %xmm4, 0x30(%rdi) > - movaps 0x40(%rsi), %xmm1 > - movaps %xmm1, 0x40(%rdi) > - movaps 0x50(%rsi), %xmm2 > - movaps %xmm2, 0x50(%rdi) > - movaps 0x60(%rsi), %xmm3 > - movaps %xmm3, 0x60(%rdi) > - movaps 0x70(%rsi), %xmm4 > - movaps %xmm4, 0x70(%rdi) > - sub $0x80, %rdx > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(shl_0_loop) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_0_bwd): > - sub $0x80, %rdx > -L(copy_backward_loop): > - movaps -0x10(%rsi), %xmm1 > - movaps %xmm1, -0x10(%rdi) > - movaps -0x20(%rsi), %xmm2 > - movaps %xmm2, -0x20(%rdi) > - movaps -0x30(%rsi), %xmm3 > - movaps %xmm3, -0x30(%rdi) > - movaps -0x40(%rsi), %xmm4 > - movaps %xmm4, -0x40(%rdi) > - movaps -0x50(%rsi), %xmm5 > - movaps %xmm5, -0x50(%rdi) > - movaps -0x60(%rsi), %xmm5 > - movaps %xmm5, -0x60(%rdi) > - movaps -0x70(%rsi), %xmm5 > - movaps %xmm5, -0x70(%rdi) > - movaps -0x80(%rsi), %xmm5 > - movaps %xmm5, -0x80(%rdi) > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(copy_backward_loop) > - > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1): > - sub $0x80, %rdx > - movaps -0x01(%rsi), %xmm1 > - movaps 0x0f(%rsi), %xmm2 > - movaps 0x1f(%rsi), %xmm3 > - movaps 0x2f(%rsi), %xmm4 > - movaps 0x3f(%rsi), %xmm5 > - movaps 0x4f(%rsi), %xmm6 > - movaps 0x5f(%rsi), %xmm7 > - movaps 0x6f(%rsi), %xmm8 > - movaps 0x7f(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $1, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $1, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $1, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $1, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $1, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $1, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $1, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_1) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1_bwd): > - movaps -0x01(%rsi), %xmm1 > - > - movaps -0x11(%rsi), %xmm2 > - palignr $1, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x21(%rsi), %xmm3 > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x31(%rsi), %xmm4 > - palignr $1, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x41(%rsi), %xmm5 > - palignr $1, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x51(%rsi), %xmm6 > - palignr $1, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x61(%rsi), %xmm7 > - palignr $1, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x71(%rsi), %xmm8 > - palignr $1, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x81(%rsi), %xmm9 > - palignr $1, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_1_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > 
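For anyone skimming the deleted file: L(shl_1)/L(shl_1_bwd) above and the L(shl_2) .. L(shl_15) blocks that follow are all the same trick at different byte offsets. Both loads and stores stay 16-byte aligned; each output block is stitched together from two adjacent aligned source chunks, which is exactly what palignr $N does in one instruction. A minimal C model of the forward direction (hypothetical helper, not glibc code; assumes dst is 16-byte aligned, shift = src % 16, and len is a multiple of 16):

#include <stddef.h>
#include <string.h>

static void
copy_shifted_fwd (unsigned char *dst, const unsigned char *src,
                  size_t len, size_t shift)
{
  const unsigned char *asrc = src - shift;  /* 16-byte aligned base */
  unsigned char window[32];

  for (size_t i = 0; i < len; i += 16)
    {
      /* Two adjacent aligned 16-byte loads; palignr $shift extracts
         these same 16 bytes from the 32-byte concatenation.  */
      memcpy (window, asrc + i, 32);
      memcpy (dst + i, window + shift, 16);
    }
}

Like the assembly, this model reads a little past src + len inside the final aligned chunk; an aligned SIMD load never crosses a page boundary, which is why the original code can afford that, but in portable C it is strictly an over-read, so treat the sketch as illustration only.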
-L(shl_2): > - sub $0x80, %rdx > - movaps -0x02(%rsi), %xmm1 > - movaps 0x0e(%rsi), %xmm2 > - movaps 0x1e(%rsi), %xmm3 > - movaps 0x2e(%rsi), %xmm4 > - movaps 0x3e(%rsi), %xmm5 > - movaps 0x4e(%rsi), %xmm6 > - movaps 0x5e(%rsi), %xmm7 > - movaps 0x6e(%rsi), %xmm8 > - movaps 0x7e(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $2, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $2, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $2, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $2, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $2, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $2, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $2, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_2) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_2_bwd): > - movaps -0x02(%rsi), %xmm1 > - > - movaps -0x12(%rsi), %xmm2 > - palignr $2, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x22(%rsi), %xmm3 > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x32(%rsi), %xmm4 > - palignr $2, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x42(%rsi), %xmm5 > - palignr $2, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x52(%rsi), %xmm6 > - palignr $2, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x62(%rsi), %xmm7 > - palignr $2, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x72(%rsi), %xmm8 > - palignr $2, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x82(%rsi), %xmm9 > - palignr $2, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_2_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3): > - sub $0x80, %rdx > - movaps -0x03(%rsi), %xmm1 > - movaps 0x0d(%rsi), %xmm2 > - movaps 0x1d(%rsi), %xmm3 > - movaps 0x2d(%rsi), %xmm4 > - movaps 0x3d(%rsi), %xmm5 > - movaps 0x4d(%rsi), %xmm6 > - movaps 0x5d(%rsi), %xmm7 > - movaps 0x6d(%rsi), %xmm8 > - movaps 0x7d(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $3, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $3, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $3, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $3, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $3, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $3, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $3, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_3) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3_bwd): > - movaps -0x03(%rsi), %xmm1 > - > - movaps -0x13(%rsi), %xmm2 > - palignr $3, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x23(%rsi), %xmm3 > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x33(%rsi), %xmm4 > - palignr $3, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x43(%rsi), %xmm5 > - palignr $3, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x53(%rsi), %xmm6 > - palignr $3, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps 
-0x63(%rsi), %xmm7 > - palignr $3, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x73(%rsi), %xmm8 > - palignr $3, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x83(%rsi), %xmm9 > - palignr $3, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_3_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4): > - sub $0x80, %rdx > - movaps -0x04(%rsi), %xmm1 > - movaps 0x0c(%rsi), %xmm2 > - movaps 0x1c(%rsi), %xmm3 > - movaps 0x2c(%rsi), %xmm4 > - movaps 0x3c(%rsi), %xmm5 > - movaps 0x4c(%rsi), %xmm6 > - movaps 0x5c(%rsi), %xmm7 > - movaps 0x6c(%rsi), %xmm8 > - movaps 0x7c(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $4, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $4, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $4, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $4, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $4, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $4, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $4, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_4) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4_bwd): > - movaps -0x04(%rsi), %xmm1 > - > - movaps -0x14(%rsi), %xmm2 > - palignr $4, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x24(%rsi), %xmm3 > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x34(%rsi), %xmm4 > - palignr $4, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x44(%rsi), %xmm5 > - palignr $4, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x54(%rsi), %xmm6 > - palignr $4, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x64(%rsi), %xmm7 > - palignr $4, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x74(%rsi), %xmm8 > - palignr $4, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x84(%rsi), %xmm9 > - palignr $4, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_4_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_5): > - sub $0x80, %rdx > - movaps -0x05(%rsi), %xmm1 > - movaps 0x0b(%rsi), %xmm2 > - movaps 0x1b(%rsi), %xmm3 > - movaps 0x2b(%rsi), %xmm4 > - movaps 0x3b(%rsi), %xmm5 > - movaps 0x4b(%rsi), %xmm6 > - movaps 0x5b(%rsi), %xmm7 > - movaps 0x6b(%rsi), %xmm8 > - movaps 0x7b(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $5, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $5, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $5, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $5, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $5, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $5, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $5, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_5) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > 
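Each forward block pairs with a _bwd twin such as L(shl_5_bwd) below: the same stitching walked from the highest address down. The backward walk matters for the memmove build (USE_AS_MEMMOVE), where the entry code near the top of the file compares %rdi with %rsi and copies backward whenever the destination starts above the source, so overlapping bytes are read before they are overwritten. A hypothetical byte-at-a-time sketch of that dispatch (the deleted code moves 16 bytes per step, but the direction rule is the same):

#include <stddef.h>

static void *
overlap_safe_copy (void *dstv, const void *srcv, size_t n)
{
  unsigned char *d = dstv;
  const unsigned char *s = srcv;

  if (d < s)
    while (n--)         /* L(copy_forward): low addresses first */
      *d++ = *s++;
  else if (d > s)
    while (n--)         /* L(copy_backward): high addresses first */
      d[n] = s[n];
  return dstv;
}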
-L(shl_5_bwd): > - movaps -0x05(%rsi), %xmm1 > - > - movaps -0x15(%rsi), %xmm2 > - palignr $5, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x25(%rsi), %xmm3 > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x35(%rsi), %xmm4 > - palignr $5, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x45(%rsi), %xmm5 > - palignr $5, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x55(%rsi), %xmm6 > - palignr $5, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x65(%rsi), %xmm7 > - palignr $5, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x75(%rsi), %xmm8 > - palignr $5, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x85(%rsi), %xmm9 > - palignr $5, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_5_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6): > - sub $0x80, %rdx > - movaps -0x06(%rsi), %xmm1 > - movaps 0x0a(%rsi), %xmm2 > - movaps 0x1a(%rsi), %xmm3 > - movaps 0x2a(%rsi), %xmm4 > - movaps 0x3a(%rsi), %xmm5 > - movaps 0x4a(%rsi), %xmm6 > - movaps 0x5a(%rsi), %xmm7 > - movaps 0x6a(%rsi), %xmm8 > - movaps 0x7a(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $6, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $6, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $6, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $6, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $6, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $6, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $6, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_6) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6_bwd): > - movaps -0x06(%rsi), %xmm1 > - > - movaps -0x16(%rsi), %xmm2 > - palignr $6, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x26(%rsi), %xmm3 > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x36(%rsi), %xmm4 > - palignr $6, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x46(%rsi), %xmm5 > - palignr $6, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x56(%rsi), %xmm6 > - palignr $6, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x66(%rsi), %xmm7 > - palignr $6, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x76(%rsi), %xmm8 > - palignr $6, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x86(%rsi), %xmm9 > - palignr $6, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_6_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7): > - sub $0x80, %rdx > - movaps -0x07(%rsi), %xmm1 > - movaps 0x09(%rsi), %xmm2 > - movaps 0x19(%rsi), %xmm3 > - movaps 0x29(%rsi), %xmm4 > - movaps 0x39(%rsi), %xmm5 > - movaps 0x49(%rsi), %xmm6 > - movaps 0x59(%rsi), %xmm7 > - movaps 0x69(%rsi), %xmm8 > - movaps 0x79(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $7, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $7, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $7, %xmm6, %xmm7 > - 
movaps %xmm7, 0x50(%rdi) > - palignr $7, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $7, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $7, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $7, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_7) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7_bwd): > - movaps -0x07(%rsi), %xmm1 > - > - movaps -0x17(%rsi), %xmm2 > - palignr $7, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x27(%rsi), %xmm3 > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x37(%rsi), %xmm4 > - palignr $7, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x47(%rsi), %xmm5 > - palignr $7, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x57(%rsi), %xmm6 > - palignr $7, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x67(%rsi), %xmm7 > - palignr $7, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x77(%rsi), %xmm8 > - palignr $7, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x87(%rsi), %xmm9 > - palignr $7, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_7_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8): > - sub $0x80, %rdx > - movaps -0x08(%rsi), %xmm1 > - movaps 0x08(%rsi), %xmm2 > - movaps 0x18(%rsi), %xmm3 > - movaps 0x28(%rsi), %xmm4 > - movaps 0x38(%rsi), %xmm5 > - movaps 0x48(%rsi), %xmm6 > - movaps 0x58(%rsi), %xmm7 > - movaps 0x68(%rsi), %xmm8 > - movaps 0x78(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $8, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $8, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $8, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $8, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $8, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $8, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $8, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_8) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8_bwd): > - movaps -0x08(%rsi), %xmm1 > - > - movaps -0x18(%rsi), %xmm2 > - palignr $8, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x28(%rsi), %xmm3 > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x38(%rsi), %xmm4 > - palignr $8, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x48(%rsi), %xmm5 > - palignr $8, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x58(%rsi), %xmm6 > - palignr $8, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x68(%rsi), %xmm7 > - palignr $8, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x78(%rsi), %xmm8 > - palignr $8, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x88(%rsi), %xmm9 > - palignr $8, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_8_bwd) > -L(shl_8_end_bwd): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - 
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9): > - sub $0x80, %rdx > - movaps -0x09(%rsi), %xmm1 > - movaps 0x07(%rsi), %xmm2 > - movaps 0x17(%rsi), %xmm3 > - movaps 0x27(%rsi), %xmm4 > - movaps 0x37(%rsi), %xmm5 > - movaps 0x47(%rsi), %xmm6 > - movaps 0x57(%rsi), %xmm7 > - movaps 0x67(%rsi), %xmm8 > - movaps 0x77(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $9, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $9, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $9, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $9, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $9, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $9, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $9, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_9) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9_bwd): > - movaps -0x09(%rsi), %xmm1 > - > - movaps -0x19(%rsi), %xmm2 > - palignr $9, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x29(%rsi), %xmm3 > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x39(%rsi), %xmm4 > - palignr $9, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x49(%rsi), %xmm5 > - palignr $9, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x59(%rsi), %xmm6 > - palignr $9, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x69(%rsi), %xmm7 > - palignr $9, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x79(%rsi), %xmm8 > - palignr $9, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x89(%rsi), %xmm9 > - palignr $9, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_9_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10): > - sub $0x80, %rdx > - movaps -0x0a(%rsi), %xmm1 > - movaps 0x06(%rsi), %xmm2 > - movaps 0x16(%rsi), %xmm3 > - movaps 0x26(%rsi), %xmm4 > - movaps 0x36(%rsi), %xmm5 > - movaps 0x46(%rsi), %xmm6 > - movaps 0x56(%rsi), %xmm7 > - movaps 0x66(%rsi), %xmm8 > - movaps 0x76(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $10, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $10, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $10, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $10, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $10, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $10, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $10, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_10) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10_bwd): > - movaps -0x0a(%rsi), %xmm1 > - > - movaps -0x1a(%rsi), %xmm2 > - palignr $10, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2a(%rsi), %xmm3 > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3a(%rsi), %xmm4 > - palignr $10, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4a(%rsi), %xmm5 > - palignr $10, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps 
-0x5a(%rsi), %xmm6 > - palignr $10, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6a(%rsi), %xmm7 > - palignr $10, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7a(%rsi), %xmm8 > - palignr $10, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8a(%rsi), %xmm9 > - palignr $10, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_10_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11): > - sub $0x80, %rdx > - movaps -0x0b(%rsi), %xmm1 > - movaps 0x05(%rsi), %xmm2 > - movaps 0x15(%rsi), %xmm3 > - movaps 0x25(%rsi), %xmm4 > - movaps 0x35(%rsi), %xmm5 > - movaps 0x45(%rsi), %xmm6 > - movaps 0x55(%rsi), %xmm7 > - movaps 0x65(%rsi), %xmm8 > - movaps 0x75(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $11, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $11, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $11, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $11, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $11, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $11, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $11, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_11) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11_bwd): > - movaps -0x0b(%rsi), %xmm1 > - > - movaps -0x1b(%rsi), %xmm2 > - palignr $11, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2b(%rsi), %xmm3 > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3b(%rsi), %xmm4 > - palignr $11, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4b(%rsi), %xmm5 > - palignr $11, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5b(%rsi), %xmm6 > - palignr $11, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6b(%rsi), %xmm7 > - palignr $11, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7b(%rsi), %xmm8 > - palignr $11, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8b(%rsi), %xmm9 > - palignr $11, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_11_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12): > - sub $0x80, %rdx > - movdqa -0x0c(%rsi), %xmm1 > - movaps 0x04(%rsi), %xmm2 > - movaps 0x14(%rsi), %xmm3 > - movaps 0x24(%rsi), %xmm4 > - movaps 0x34(%rsi), %xmm5 > - movaps 0x44(%rsi), %xmm6 > - movaps 0x54(%rsi), %xmm7 > - movaps 0x64(%rsi), %xmm8 > - movaps 0x74(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $12, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $12, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $12, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $12, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $12, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $12, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $12, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - > - lea 0x80(%rdi), %rdi > - jae L(shl_12) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx 
> - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12_bwd): > - movaps -0x0c(%rsi), %xmm1 > - > - movaps -0x1c(%rsi), %xmm2 > - palignr $12, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2c(%rsi), %xmm3 > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3c(%rsi), %xmm4 > - palignr $12, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4c(%rsi), %xmm5 > - palignr $12, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5c(%rsi), %xmm6 > - palignr $12, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6c(%rsi), %xmm7 > - palignr $12, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7c(%rsi), %xmm8 > - palignr $12, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8c(%rsi), %xmm9 > - palignr $12, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_12_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13): > - sub $0x80, %rdx > - movaps -0x0d(%rsi), %xmm1 > - movaps 0x03(%rsi), %xmm2 > - movaps 0x13(%rsi), %xmm3 > - movaps 0x23(%rsi), %xmm4 > - movaps 0x33(%rsi), %xmm5 > - movaps 0x43(%rsi), %xmm6 > - movaps 0x53(%rsi), %xmm7 > - movaps 0x63(%rsi), %xmm8 > - movaps 0x73(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $13, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $13, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $13, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $13, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $13, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $13, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $13, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_13) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13_bwd): > - movaps -0x0d(%rsi), %xmm1 > - > - movaps -0x1d(%rsi), %xmm2 > - palignr $13, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2d(%rsi), %xmm3 > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3d(%rsi), %xmm4 > - palignr $13, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4d(%rsi), %xmm5 > - palignr $13, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5d(%rsi), %xmm6 > - palignr $13, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6d(%rsi), %xmm7 > - palignr $13, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7d(%rsi), %xmm8 > - palignr $13, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8d(%rsi), %xmm9 > - palignr $13, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_13_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14): > - sub $0x80, %rdx > - movaps -0x0e(%rsi), %xmm1 > - movaps 0x02(%rsi), %xmm2 > - movaps 0x12(%rsi), %xmm3 > - movaps 0x22(%rsi), %xmm4 > - movaps 0x32(%rsi), %xmm5 > - movaps 0x42(%rsi), %xmm6 > - movaps 0x52(%rsi), %xmm7 > - movaps 0x62(%rsi), %xmm8 > - movaps 0x72(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > 
- palignr $14, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $14, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $14, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $14, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $14, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $14, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $14, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_14) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14_bwd): > - movaps -0x0e(%rsi), %xmm1 > - > - movaps -0x1e(%rsi), %xmm2 > - palignr $14, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2e(%rsi), %xmm3 > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3e(%rsi), %xmm4 > - palignr $14, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4e(%rsi), %xmm5 > - palignr $14, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5e(%rsi), %xmm6 > - palignr $14, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6e(%rsi), %xmm7 > - palignr $14, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7e(%rsi), %xmm8 > - palignr $14, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8e(%rsi), %xmm9 > - palignr $14, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_14_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15): > - sub $0x80, %rdx > - movaps -0x0f(%rsi), %xmm1 > - movaps 0x01(%rsi), %xmm2 > - movaps 0x11(%rsi), %xmm3 > - movaps 0x21(%rsi), %xmm4 > - movaps 0x31(%rsi), %xmm5 > - movaps 0x41(%rsi), %xmm6 > - movaps 0x51(%rsi), %xmm7 > - movaps 0x61(%rsi), %xmm8 > - movaps 0x71(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $15, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $15, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $15, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $15, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $15, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $15, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $15, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_15) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15_bwd): > - movaps -0x0f(%rsi), %xmm1 > - > - movaps -0x1f(%rsi), %xmm2 > - palignr $15, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2f(%rsi), %xmm3 > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3f(%rsi), %xmm4 > - palignr $15, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4f(%rsi), %xmm5 > - palignr $15, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5f(%rsi), %xmm6 > - palignr $15, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6f(%rsi), %xmm7 > - palignr $15, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7f(%rsi), %xmm8 > - palignr $15, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8f(%rsi), %xmm9 > - palignr $15, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - 
lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_15_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_fwd): > - movdqu (%rsi), %xmm1 > - movdqu %xmm0, (%r8) > - movdqa %xmm1, (%rdi) > - sub $16, %rdx > - add $16, %rsi > - add $16, %rdi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef USE_AS_MEMMOVE > - mov %rsi, %r9 > - sub %rdi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_fwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_fwd_start) > -L(memmove_is_memcpy_fwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger_in_fwd) > - mov %rdx, %rcx > -L(bigger_in_fwd): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy_fwd) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy_fwd) > - add %rcx, %rdx > - xor %rcx, %rcx > -L(2steps_copy_fwd): > - sub $0x80, %rdx > -L(gobble_mem_fwd_loop): > - sub $0x80, %rdx > - prefetcht0 0x200(%rsi) > - prefetcht0 0x300(%rsi) > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lfence > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - movntdq %xmm4, 0x40(%rdi) > - movntdq %xmm5, 0x50(%rdi) > - movntdq %xmm6, 0x60(%rdi) > - movntdq %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_mem_fwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_fwd_end) > - add $0x80, %rdx > -L(ll_cache_copy_fwd): > - add %rcx, %rdx > -L(ll_cache_copy_fwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop_fwd): > - prefetchnta 0x1c0(%rsi) > - prefetchnta 0x280(%rsi) > - prefetchnta 0x1c0(%rdi) > - prefetchnta 0x280(%rdi) > - sub $0x80, %rdx > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - movdqa %xmm2, 0x20(%rdi) > - movdqa %xmm3, 0x30(%rdi) > - movdqa %xmm4, 0x40(%rdi) > - movdqa %xmm5, 0x50(%rdi) > - movdqa %xmm6, 0x60(%rdi) > - movdqa %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_ll_loop_fwd) > -L(gobble_mem_fwd_end): > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_bwd): > - add %rdx, %rsi > - add %rdx, %rdi > - > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $-16, %rdi > - sub %rdi, %r9 > - sub %r9, %rsi > - sub %r9, %rdx > - > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef USE_AS_MEMMOVE > - mov %rdi, %r9 > - sub %rsi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_bwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_bwd_start) > -L(memmove_is_memcpy_bwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger) > - mov %rdx, %rcx > -L(bigger): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy) > - add %rcx, %rdx > - xor %rcx, 
%rcx > -L(2steps_copy): > - sub $0x80, %rdx > -L(gobble_mem_bwd_loop): > - sub $0x80, %rdx > - prefetcht0 -0x200(%rsi) > - prefetcht0 -0x300(%rsi) > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - lfence > - movntdq %xmm1, -0x10(%rdi) > - movntdq %xmm2, -0x20(%rdi) > - movntdq %xmm3, -0x30(%rdi) > - movntdq %xmm4, -0x40(%rdi) > - movntdq %xmm5, -0x50(%rdi) > - movntdq %xmm6, -0x60(%rdi) > - movntdq %xmm7, -0x70(%rdi) > - movntdq %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_mem_bwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_bwd_end) > - add $0x80, %rdx > -L(ll_cache_copy): > - add %rcx, %rdx > -L(ll_cache_copy_bwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop): > - prefetchnta -0x1c0(%rsi) > - prefetchnta -0x280(%rsi) > - prefetchnta -0x1c0(%rdi) > - prefetchnta -0x280(%rdi) > - sub $0x80, %rdx > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - movdqa %xmm1, -0x10(%rdi) > - movdqa %xmm2, -0x20(%rdi) > - movdqa %xmm3, -0x30(%rdi) > - movdqa %xmm4, -0x40(%rdi) > - movdqa %xmm5, -0x50(%rdi) > - movdqa %xmm6, -0x60(%rdi) > - movdqa %xmm7, -0x70(%rdi) > - movdqa %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_ll_loop) > -L(gobble_mem_bwd_end): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rsi > - sub %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(fwd_write_128bytes): > - lddqu -128(%rsi), %xmm0 > - movdqu %xmm0, -128(%rdi) > -L(fwd_write_112bytes): > - lddqu -112(%rsi), %xmm0 > - movdqu %xmm0, -112(%rdi) > -L(fwd_write_96bytes): > - lddqu -96(%rsi), %xmm0 > - movdqu %xmm0, -96(%rdi) > -L(fwd_write_80bytes): > - lddqu -80(%rsi), %xmm0 > - movdqu %xmm0, -80(%rdi) > -L(fwd_write_64bytes): > - lddqu -64(%rsi), %xmm0 > - movdqu %xmm0, -64(%rdi) > -L(fwd_write_48bytes): > - lddqu -48(%rsi), %xmm0 > - movdqu %xmm0, -48(%rdi) > -L(fwd_write_32bytes): > - lddqu -32(%rsi), %xmm0 > - movdqu %xmm0, -32(%rdi) > -L(fwd_write_16bytes): > - lddqu -16(%rsi), %xmm0 > - movdqu %xmm0, -16(%rdi) > -L(fwd_write_0bytes): > - ret > - > - > - .p2align 4 > -L(fwd_write_143bytes): > - lddqu -143(%rsi), %xmm0 > - movdqu %xmm0, -143(%rdi) > -L(fwd_write_127bytes): > - lddqu -127(%rsi), %xmm0 > - movdqu %xmm0, -127(%rdi) > -L(fwd_write_111bytes): > - lddqu -111(%rsi), %xmm0 > - movdqu %xmm0, -111(%rdi) > -L(fwd_write_95bytes): > - lddqu -95(%rsi), %xmm0 > - movdqu %xmm0, -95(%rdi) > -L(fwd_write_79bytes): > - lddqu -79(%rsi), %xmm0 > - movdqu %xmm0, -79(%rdi) > -L(fwd_write_63bytes): > - lddqu -63(%rsi), %xmm0 > - movdqu %xmm0, -63(%rdi) > -L(fwd_write_47bytes): > - lddqu -47(%rsi), %xmm0 > - movdqu %xmm0, -47(%rdi) > -L(fwd_write_31bytes): > - lddqu -31(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -31(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_15bytes): > - mov -15(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -15(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_142bytes): > - lddqu -142(%rsi), %xmm0 > - movdqu %xmm0, -142(%rdi) > -L(fwd_write_126bytes): > - lddqu -126(%rsi), %xmm0 > - movdqu %xmm0, -126(%rdi) > 
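One more note, on the L(gobble_mem_fwd)/L(gobble_mem_bwd) loops quoted just before these ladders: once the copy size exceeds the thresholds read from __x86_data_cache_size / __x86_shared_cache_size_half, the deleted code prefetches ahead and switches from movdqa to movntdq non-temporal stores, then orders them with sfence, so a huge copy does not evict useful data from the cache. A rough intrinsics model of one 64-byte step (hypothetical helper; assumes dst is 16-byte aligned and n is a multiple of 64):

#include <emmintrin.h>  /* SSE2: _mm_loadu_si128, _mm_stream_si128 */
#include <stddef.h>

static void
stream_copy (char *dst, const char *src, size_t n)
{
  for (size_t i = 0; i < n; i += 64)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) (src + i));
      __m128i b = _mm_loadu_si128 ((const __m128i *) (src + i + 16));
      __m128i c = _mm_loadu_si128 ((const __m128i *) (src + i + 32));
      __m128i d = _mm_loadu_si128 ((const __m128i *) (src + i + 48));
      _mm_stream_si128 ((__m128i *) (dst + i), a);      /* movntdq */
      _mm_stream_si128 ((__m128i *) (dst + i + 16), b);
      _mm_stream_si128 ((__m128i *) (dst + i + 32), c);
      _mm_stream_si128 ((__m128i *) (dst + i + 48), d);
    }
  _mm_sfence ();  /* order the non-temporal stores, as the asm does */
}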
-L(fwd_write_110bytes): > - lddqu -110(%rsi), %xmm0 > - movdqu %xmm0, -110(%rdi) > -L(fwd_write_94bytes): > - lddqu -94(%rsi), %xmm0 > - movdqu %xmm0, -94(%rdi) > -L(fwd_write_78bytes): > - lddqu -78(%rsi), %xmm0 > - movdqu %xmm0, -78(%rdi) > -L(fwd_write_62bytes): > - lddqu -62(%rsi), %xmm0 > - movdqu %xmm0, -62(%rdi) > -L(fwd_write_46bytes): > - lddqu -46(%rsi), %xmm0 > - movdqu %xmm0, -46(%rdi) > -L(fwd_write_30bytes): > - lddqu -30(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -30(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_14bytes): > - mov -14(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -14(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_141bytes): > - lddqu -141(%rsi), %xmm0 > - movdqu %xmm0, -141(%rdi) > -L(fwd_write_125bytes): > - lddqu -125(%rsi), %xmm0 > - movdqu %xmm0, -125(%rdi) > -L(fwd_write_109bytes): > - lddqu -109(%rsi), %xmm0 > - movdqu %xmm0, -109(%rdi) > -L(fwd_write_93bytes): > - lddqu -93(%rsi), %xmm0 > - movdqu %xmm0, -93(%rdi) > -L(fwd_write_77bytes): > - lddqu -77(%rsi), %xmm0 > - movdqu %xmm0, -77(%rdi) > -L(fwd_write_61bytes): > - lddqu -61(%rsi), %xmm0 > - movdqu %xmm0, -61(%rdi) > -L(fwd_write_45bytes): > - lddqu -45(%rsi), %xmm0 > - movdqu %xmm0, -45(%rdi) > -L(fwd_write_29bytes): > - lddqu -29(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -29(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_13bytes): > - mov -13(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -13(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_140bytes): > - lddqu -140(%rsi), %xmm0 > - movdqu %xmm0, -140(%rdi) > -L(fwd_write_124bytes): > - lddqu -124(%rsi), %xmm0 > - movdqu %xmm0, -124(%rdi) > -L(fwd_write_108bytes): > - lddqu -108(%rsi), %xmm0 > - movdqu %xmm0, -108(%rdi) > -L(fwd_write_92bytes): > - lddqu -92(%rsi), %xmm0 > - movdqu %xmm0, -92(%rdi) > -L(fwd_write_76bytes): > - lddqu -76(%rsi), %xmm0 > - movdqu %xmm0, -76(%rdi) > -L(fwd_write_60bytes): > - lddqu -60(%rsi), %xmm0 > - movdqu %xmm0, -60(%rdi) > -L(fwd_write_44bytes): > - lddqu -44(%rsi), %xmm0 > - movdqu %xmm0, -44(%rdi) > -L(fwd_write_28bytes): > - lddqu -28(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -28(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_12bytes): > - mov -12(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -12(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_139bytes): > - lddqu -139(%rsi), %xmm0 > - movdqu %xmm0, -139(%rdi) > -L(fwd_write_123bytes): > - lddqu -123(%rsi), %xmm0 > - movdqu %xmm0, -123(%rdi) > -L(fwd_write_107bytes): > - lddqu -107(%rsi), %xmm0 > - movdqu %xmm0, -107(%rdi) > -L(fwd_write_91bytes): > - lddqu -91(%rsi), %xmm0 > - movdqu %xmm0, -91(%rdi) > -L(fwd_write_75bytes): > - lddqu -75(%rsi), %xmm0 > - movdqu %xmm0, -75(%rdi) > -L(fwd_write_59bytes): > - lddqu -59(%rsi), %xmm0 > - movdqu %xmm0, -59(%rdi) > -L(fwd_write_43bytes): > - lddqu -43(%rsi), %xmm0 > - movdqu %xmm0, -43(%rdi) > -L(fwd_write_27bytes): > - lddqu -27(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -27(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_11bytes): > - mov -11(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -11(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_138bytes): > - lddqu -138(%rsi), %xmm0 > - movdqu %xmm0, -138(%rdi) > -L(fwd_write_122bytes): > - lddqu -122(%rsi), %xmm0 > - movdqu %xmm0, -122(%rdi) > -L(fwd_write_106bytes): > - lddqu -106(%rsi), 
%xmm0 > - movdqu %xmm0, -106(%rdi) > -L(fwd_write_90bytes): > - lddqu -90(%rsi), %xmm0 > - movdqu %xmm0, -90(%rdi) > -L(fwd_write_74bytes): > - lddqu -74(%rsi), %xmm0 > - movdqu %xmm0, -74(%rdi) > -L(fwd_write_58bytes): > - lddqu -58(%rsi), %xmm0 > - movdqu %xmm0, -58(%rdi) > -L(fwd_write_42bytes): > - lddqu -42(%rsi), %xmm0 > - movdqu %xmm0, -42(%rdi) > -L(fwd_write_26bytes): > - lddqu -26(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -26(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_10bytes): > - mov -10(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -10(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_137bytes): > - lddqu -137(%rsi), %xmm0 > - movdqu %xmm0, -137(%rdi) > -L(fwd_write_121bytes): > - lddqu -121(%rsi), %xmm0 > - movdqu %xmm0, -121(%rdi) > -L(fwd_write_105bytes): > - lddqu -105(%rsi), %xmm0 > - movdqu %xmm0, -105(%rdi) > -L(fwd_write_89bytes): > - lddqu -89(%rsi), %xmm0 > - movdqu %xmm0, -89(%rdi) > -L(fwd_write_73bytes): > - lddqu -73(%rsi), %xmm0 > - movdqu %xmm0, -73(%rdi) > -L(fwd_write_57bytes): > - lddqu -57(%rsi), %xmm0 > - movdqu %xmm0, -57(%rdi) > -L(fwd_write_41bytes): > - lddqu -41(%rsi), %xmm0 > - movdqu %xmm0, -41(%rdi) > -L(fwd_write_25bytes): > - lddqu -25(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -25(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_9bytes): > - mov -9(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -9(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_136bytes): > - lddqu -136(%rsi), %xmm0 > - movdqu %xmm0, -136(%rdi) > -L(fwd_write_120bytes): > - lddqu -120(%rsi), %xmm0 > - movdqu %xmm0, -120(%rdi) > -L(fwd_write_104bytes): > - lddqu -104(%rsi), %xmm0 > - movdqu %xmm0, -104(%rdi) > -L(fwd_write_88bytes): > - lddqu -88(%rsi), %xmm0 > - movdqu %xmm0, -88(%rdi) > -L(fwd_write_72bytes): > - lddqu -72(%rsi), %xmm0 > - movdqu %xmm0, -72(%rdi) > -L(fwd_write_56bytes): > - lddqu -56(%rsi), %xmm0 > - movdqu %xmm0, -56(%rdi) > -L(fwd_write_40bytes): > - lddqu -40(%rsi), %xmm0 > - movdqu %xmm0, -40(%rdi) > -L(fwd_write_24bytes): > - lddqu -24(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -24(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_8bytes): > - mov -8(%rsi), %rdx > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_135bytes): > - lddqu -135(%rsi), %xmm0 > - movdqu %xmm0, -135(%rdi) > -L(fwd_write_119bytes): > - lddqu -119(%rsi), %xmm0 > - movdqu %xmm0, -119(%rdi) > -L(fwd_write_103bytes): > - lddqu -103(%rsi), %xmm0 > - movdqu %xmm0, -103(%rdi) > -L(fwd_write_87bytes): > - lddqu -87(%rsi), %xmm0 > - movdqu %xmm0, -87(%rdi) > -L(fwd_write_71bytes): > - lddqu -71(%rsi), %xmm0 > - movdqu %xmm0, -71(%rdi) > -L(fwd_write_55bytes): > - lddqu -55(%rsi), %xmm0 > - movdqu %xmm0, -55(%rdi) > -L(fwd_write_39bytes): > - lddqu -39(%rsi), %xmm0 > - movdqu %xmm0, -39(%rdi) > -L(fwd_write_23bytes): > - lddqu -23(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -23(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_7bytes): > - mov -7(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -7(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_134bytes): > - lddqu -134(%rsi), %xmm0 > - movdqu %xmm0, -134(%rdi) > -L(fwd_write_118bytes): > - lddqu -118(%rsi), %xmm0 > - movdqu %xmm0, -118(%rdi) > -L(fwd_write_102bytes): > - lddqu -102(%rsi), %xmm0 > - movdqu %xmm0, -102(%rdi) > -L(fwd_write_86bytes): > - lddqu -86(%rsi), %xmm0 > - movdqu 
%xmm0, -86(%rdi) > -L(fwd_write_70bytes): > - lddqu -70(%rsi), %xmm0 > - movdqu %xmm0, -70(%rdi) > -L(fwd_write_54bytes): > - lddqu -54(%rsi), %xmm0 > - movdqu %xmm0, -54(%rdi) > -L(fwd_write_38bytes): > - lddqu -38(%rsi), %xmm0 > - movdqu %xmm0, -38(%rdi) > -L(fwd_write_22bytes): > - lddqu -22(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -22(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_6bytes): > - mov -6(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -6(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_133bytes): > - lddqu -133(%rsi), %xmm0 > - movdqu %xmm0, -133(%rdi) > -L(fwd_write_117bytes): > - lddqu -117(%rsi), %xmm0 > - movdqu %xmm0, -117(%rdi) > -L(fwd_write_101bytes): > - lddqu -101(%rsi), %xmm0 > - movdqu %xmm0, -101(%rdi) > -L(fwd_write_85bytes): > - lddqu -85(%rsi), %xmm0 > - movdqu %xmm0, -85(%rdi) > -L(fwd_write_69bytes): > - lddqu -69(%rsi), %xmm0 > - movdqu %xmm0, -69(%rdi) > -L(fwd_write_53bytes): > - lddqu -53(%rsi), %xmm0 > - movdqu %xmm0, -53(%rdi) > -L(fwd_write_37bytes): > - lddqu -37(%rsi), %xmm0 > - movdqu %xmm0, -37(%rdi) > -L(fwd_write_21bytes): > - lddqu -21(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -21(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_5bytes): > - mov -5(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -5(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_132bytes): > - lddqu -132(%rsi), %xmm0 > - movdqu %xmm0, -132(%rdi) > -L(fwd_write_116bytes): > - lddqu -116(%rsi), %xmm0 > - movdqu %xmm0, -116(%rdi) > -L(fwd_write_100bytes): > - lddqu -100(%rsi), %xmm0 > - movdqu %xmm0, -100(%rdi) > -L(fwd_write_84bytes): > - lddqu -84(%rsi), %xmm0 > - movdqu %xmm0, -84(%rdi) > -L(fwd_write_68bytes): > - lddqu -68(%rsi), %xmm0 > - movdqu %xmm0, -68(%rdi) > -L(fwd_write_52bytes): > - lddqu -52(%rsi), %xmm0 > - movdqu %xmm0, -52(%rdi) > -L(fwd_write_36bytes): > - lddqu -36(%rsi), %xmm0 > - movdqu %xmm0, -36(%rdi) > -L(fwd_write_20bytes): > - lddqu -20(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -20(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_4bytes): > - mov -4(%rsi), %edx > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_131bytes): > - lddqu -131(%rsi), %xmm0 > - movdqu %xmm0, -131(%rdi) > -L(fwd_write_115bytes): > - lddqu -115(%rsi), %xmm0 > - movdqu %xmm0, -115(%rdi) > -L(fwd_write_99bytes): > - lddqu -99(%rsi), %xmm0 > - movdqu %xmm0, -99(%rdi) > -L(fwd_write_83bytes): > - lddqu -83(%rsi), %xmm0 > - movdqu %xmm0, -83(%rdi) > -L(fwd_write_67bytes): > - lddqu -67(%rsi), %xmm0 > - movdqu %xmm0, -67(%rdi) > -L(fwd_write_51bytes): > - lddqu -51(%rsi), %xmm0 > - movdqu %xmm0, -51(%rdi) > -L(fwd_write_35bytes): > - lddqu -35(%rsi), %xmm0 > - movdqu %xmm0, -35(%rdi) > -L(fwd_write_19bytes): > - lddqu -19(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -19(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_3bytes): > - mov -3(%rsi), %dx > - mov -2(%rsi), %cx > - mov %dx, -3(%rdi) > - mov %cx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_130bytes): > - lddqu -130(%rsi), %xmm0 > - movdqu %xmm0, -130(%rdi) > -L(fwd_write_114bytes): > - lddqu -114(%rsi), %xmm0 > - movdqu %xmm0, -114(%rdi) > -L(fwd_write_98bytes): > - lddqu -98(%rsi), %xmm0 > - movdqu %xmm0, -98(%rdi) > -L(fwd_write_82bytes): > - lddqu -82(%rsi), %xmm0 > - movdqu %xmm0, -82(%rdi) > -L(fwd_write_66bytes): > - lddqu -66(%rsi), %xmm0 > - movdqu %xmm0, -66(%rdi) > 
-L(fwd_write_50bytes): > - lddqu -50(%rsi), %xmm0 > - movdqu %xmm0, -50(%rdi) > -L(fwd_write_34bytes): > - lddqu -34(%rsi), %xmm0 > - movdqu %xmm0, -34(%rdi) > -L(fwd_write_18bytes): > - lddqu -18(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -18(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_2bytes): > - movzwl -2(%rsi), %edx > - mov %dx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_129bytes): > - lddqu -129(%rsi), %xmm0 > - movdqu %xmm0, -129(%rdi) > -L(fwd_write_113bytes): > - lddqu -113(%rsi), %xmm0 > - movdqu %xmm0, -113(%rdi) > -L(fwd_write_97bytes): > - lddqu -97(%rsi), %xmm0 > - movdqu %xmm0, -97(%rdi) > -L(fwd_write_81bytes): > - lddqu -81(%rsi), %xmm0 > - movdqu %xmm0, -81(%rdi) > -L(fwd_write_65bytes): > - lddqu -65(%rsi), %xmm0 > - movdqu %xmm0, -65(%rdi) > -L(fwd_write_49bytes): > - lddqu -49(%rsi), %xmm0 > - movdqu %xmm0, -49(%rdi) > -L(fwd_write_33bytes): > - lddqu -33(%rsi), %xmm0 > - movdqu %xmm0, -33(%rdi) > -L(fwd_write_17bytes): > - lddqu -17(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -17(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_1bytes): > - movzbl -1(%rsi), %edx > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_128bytes): > - lddqu 112(%rsi), %xmm0 > - movdqu %xmm0, 112(%rdi) > -L(bwd_write_112bytes): > - lddqu 96(%rsi), %xmm0 > - movdqu %xmm0, 96(%rdi) > -L(bwd_write_96bytes): > - lddqu 80(%rsi), %xmm0 > - movdqu %xmm0, 80(%rdi) > -L(bwd_write_80bytes): > - lddqu 64(%rsi), %xmm0 > - movdqu %xmm0, 64(%rdi) > -L(bwd_write_64bytes): > - lddqu 48(%rsi), %xmm0 > - movdqu %xmm0, 48(%rdi) > -L(bwd_write_48bytes): > - lddqu 32(%rsi), %xmm0 > - movdqu %xmm0, 32(%rdi) > -L(bwd_write_32bytes): > - lddqu 16(%rsi), %xmm0 > - movdqu %xmm0, 16(%rdi) > -L(bwd_write_16bytes): > - lddqu (%rsi), %xmm0 > - movdqu %xmm0, (%rdi) > -L(bwd_write_0bytes): > - ret > - > - .p2align 4 > -L(bwd_write_143bytes): > - lddqu 127(%rsi), %xmm0 > - movdqu %xmm0, 127(%rdi) > -L(bwd_write_127bytes): > - lddqu 111(%rsi), %xmm0 > - movdqu %xmm0, 111(%rdi) > -L(bwd_write_111bytes): > - lddqu 95(%rsi), %xmm0 > - movdqu %xmm0, 95(%rdi) > -L(bwd_write_95bytes): > - lddqu 79(%rsi), %xmm0 > - movdqu %xmm0, 79(%rdi) > -L(bwd_write_79bytes): > - lddqu 63(%rsi), %xmm0 > - movdqu %xmm0, 63(%rdi) > -L(bwd_write_63bytes): > - lddqu 47(%rsi), %xmm0 > - movdqu %xmm0, 47(%rdi) > -L(bwd_write_47bytes): > - lddqu 31(%rsi), %xmm0 > - movdqu %xmm0, 31(%rdi) > -L(bwd_write_31bytes): > - lddqu 15(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 15(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - > - .p2align 4 > -L(bwd_write_15bytes): > - mov 7(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 7(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_142bytes): > - lddqu 126(%rsi), %xmm0 > - movdqu %xmm0, 126(%rdi) > -L(bwd_write_126bytes): > - lddqu 110(%rsi), %xmm0 > - movdqu %xmm0, 110(%rdi) > -L(bwd_write_110bytes): > - lddqu 94(%rsi), %xmm0 > - movdqu %xmm0, 94(%rdi) > -L(bwd_write_94bytes): > - lddqu 78(%rsi), %xmm0 > - movdqu %xmm0, 78(%rdi) > -L(bwd_write_78bytes): > - lddqu 62(%rsi), %xmm0 > - movdqu %xmm0, 62(%rdi) > -L(bwd_write_62bytes): > - lddqu 46(%rsi), %xmm0 > - movdqu %xmm0, 46(%rdi) > -L(bwd_write_46bytes): > - lddqu 30(%rsi), %xmm0 > - movdqu %xmm0, 30(%rdi) > -L(bwd_write_30bytes): > - lddqu 14(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 14(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_14bytes): > - mov 6(%rsi), %rdx > - mov (%rsi), %rcx > - mov 
%rdx, 6(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_141bytes): > - lddqu 125(%rsi), %xmm0 > - movdqu %xmm0, 125(%rdi) > -L(bwd_write_125bytes): > - lddqu 109(%rsi), %xmm0 > - movdqu %xmm0, 109(%rdi) > -L(bwd_write_109bytes): > - lddqu 93(%rsi), %xmm0 > - movdqu %xmm0, 93(%rdi) > -L(bwd_write_93bytes): > - lddqu 77(%rsi), %xmm0 > - movdqu %xmm0, 77(%rdi) > -L(bwd_write_77bytes): > - lddqu 61(%rsi), %xmm0 > - movdqu %xmm0, 61(%rdi) > -L(bwd_write_61bytes): > - lddqu 45(%rsi), %xmm0 > - movdqu %xmm0, 45(%rdi) > -L(bwd_write_45bytes): > - lddqu 29(%rsi), %xmm0 > - movdqu %xmm0, 29(%rdi) > -L(bwd_write_29bytes): > - lddqu 13(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 13(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_13bytes): > - mov 5(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 5(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_140bytes): > - lddqu 124(%rsi), %xmm0 > - movdqu %xmm0, 124(%rdi) > -L(bwd_write_124bytes): > - lddqu 108(%rsi), %xmm0 > - movdqu %xmm0, 108(%rdi) > -L(bwd_write_108bytes): > - lddqu 92(%rsi), %xmm0 > - movdqu %xmm0, 92(%rdi) > -L(bwd_write_92bytes): > - lddqu 76(%rsi), %xmm0 > - movdqu %xmm0, 76(%rdi) > -L(bwd_write_76bytes): > - lddqu 60(%rsi), %xmm0 > - movdqu %xmm0, 60(%rdi) > -L(bwd_write_60bytes): > - lddqu 44(%rsi), %xmm0 > - movdqu %xmm0, 44(%rdi) > -L(bwd_write_44bytes): > - lddqu 28(%rsi), %xmm0 > - movdqu %xmm0, 28(%rdi) > -L(bwd_write_28bytes): > - lddqu 12(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 12(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_12bytes): > - mov 4(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 4(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_139bytes): > - lddqu 123(%rsi), %xmm0 > - movdqu %xmm0, 123(%rdi) > -L(bwd_write_123bytes): > - lddqu 107(%rsi), %xmm0 > - movdqu %xmm0, 107(%rdi) > -L(bwd_write_107bytes): > - lddqu 91(%rsi), %xmm0 > - movdqu %xmm0, 91(%rdi) > -L(bwd_write_91bytes): > - lddqu 75(%rsi), %xmm0 > - movdqu %xmm0, 75(%rdi) > -L(bwd_write_75bytes): > - lddqu 59(%rsi), %xmm0 > - movdqu %xmm0, 59(%rdi) > -L(bwd_write_59bytes): > - lddqu 43(%rsi), %xmm0 > - movdqu %xmm0, 43(%rdi) > -L(bwd_write_43bytes): > - lddqu 27(%rsi), %xmm0 > - movdqu %xmm0, 27(%rdi) > -L(bwd_write_27bytes): > - lddqu 11(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 11(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_11bytes): > - mov 3(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 3(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_138bytes): > - lddqu 122(%rsi), %xmm0 > - movdqu %xmm0, 122(%rdi) > -L(bwd_write_122bytes): > - lddqu 106(%rsi), %xmm0 > - movdqu %xmm0, 106(%rdi) > -L(bwd_write_106bytes): > - lddqu 90(%rsi), %xmm0 > - movdqu %xmm0, 90(%rdi) > -L(bwd_write_90bytes): > - lddqu 74(%rsi), %xmm0 > - movdqu %xmm0, 74(%rdi) > -L(bwd_write_74bytes): > - lddqu 58(%rsi), %xmm0 > - movdqu %xmm0, 58(%rdi) > -L(bwd_write_58bytes): > - lddqu 42(%rsi), %xmm0 > - movdqu %xmm0, 42(%rdi) > -L(bwd_write_42bytes): > - lddqu 26(%rsi), %xmm0 > - movdqu %xmm0, 26(%rdi) > -L(bwd_write_26bytes): > - lddqu 10(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 10(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_10bytes): > - mov 2(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 2(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_137bytes): > - lddqu 121(%rsi), %xmm0 > - movdqu %xmm0, 121(%rdi) > -L(bwd_write_121bytes): > - 
lddqu 105(%rsi), %xmm0 > - movdqu %xmm0, 105(%rdi) > -L(bwd_write_105bytes): > - lddqu 89(%rsi), %xmm0 > - movdqu %xmm0, 89(%rdi) > -L(bwd_write_89bytes): > - lddqu 73(%rsi), %xmm0 > - movdqu %xmm0, 73(%rdi) > -L(bwd_write_73bytes): > - lddqu 57(%rsi), %xmm0 > - movdqu %xmm0, 57(%rdi) > -L(bwd_write_57bytes): > - lddqu 41(%rsi), %xmm0 > - movdqu %xmm0, 41(%rdi) > -L(bwd_write_41bytes): > - lddqu 25(%rsi), %xmm0 > - movdqu %xmm0, 25(%rdi) > -L(bwd_write_25bytes): > - lddqu 9(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 9(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_9bytes): > - mov 1(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 1(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_136bytes): > - lddqu 120(%rsi), %xmm0 > - movdqu %xmm0, 120(%rdi) > -L(bwd_write_120bytes): > - lddqu 104(%rsi), %xmm0 > - movdqu %xmm0, 104(%rdi) > -L(bwd_write_104bytes): > - lddqu 88(%rsi), %xmm0 > - movdqu %xmm0, 88(%rdi) > -L(bwd_write_88bytes): > - lddqu 72(%rsi), %xmm0 > - movdqu %xmm0, 72(%rdi) > -L(bwd_write_72bytes): > - lddqu 56(%rsi), %xmm0 > - movdqu %xmm0, 56(%rdi) > -L(bwd_write_56bytes): > - lddqu 40(%rsi), %xmm0 > - movdqu %xmm0, 40(%rdi) > -L(bwd_write_40bytes): > - lddqu 24(%rsi), %xmm0 > - movdqu %xmm0, 24(%rdi) > -L(bwd_write_24bytes): > - lddqu 8(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 8(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_8bytes): > - mov (%rsi), %rdx > - mov %rdx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_135bytes): > - lddqu 119(%rsi), %xmm0 > - movdqu %xmm0, 119(%rdi) > -L(bwd_write_119bytes): > - lddqu 103(%rsi), %xmm0 > - movdqu %xmm0, 103(%rdi) > -L(bwd_write_103bytes): > - lddqu 87(%rsi), %xmm0 > - movdqu %xmm0, 87(%rdi) > -L(bwd_write_87bytes): > - lddqu 71(%rsi), %xmm0 > - movdqu %xmm0, 71(%rdi) > -L(bwd_write_71bytes): > - lddqu 55(%rsi), %xmm0 > - movdqu %xmm0, 55(%rdi) > -L(bwd_write_55bytes): > - lddqu 39(%rsi), %xmm0 > - movdqu %xmm0, 39(%rdi) > -L(bwd_write_39bytes): > - lddqu 23(%rsi), %xmm0 > - movdqu %xmm0, 23(%rdi) > -L(bwd_write_23bytes): > - lddqu 7(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 7(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_7bytes): > - mov 3(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 3(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_134bytes): > - lddqu 118(%rsi), %xmm0 > - movdqu %xmm0, 118(%rdi) > -L(bwd_write_118bytes): > - lddqu 102(%rsi), %xmm0 > - movdqu %xmm0, 102(%rdi) > -L(bwd_write_102bytes): > - lddqu 86(%rsi), %xmm0 > - movdqu %xmm0, 86(%rdi) > -L(bwd_write_86bytes): > - lddqu 70(%rsi), %xmm0 > - movdqu %xmm0, 70(%rdi) > -L(bwd_write_70bytes): > - lddqu 54(%rsi), %xmm0 > - movdqu %xmm0, 54(%rdi) > -L(bwd_write_54bytes): > - lddqu 38(%rsi), %xmm0 > - movdqu %xmm0, 38(%rdi) > -L(bwd_write_38bytes): > - lddqu 22(%rsi), %xmm0 > - movdqu %xmm0, 22(%rdi) > -L(bwd_write_22bytes): > - lddqu 6(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 6(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_6bytes): > - mov 2(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 2(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_133bytes): > - lddqu 117(%rsi), %xmm0 > - movdqu %xmm0, 117(%rdi) > -L(bwd_write_117bytes): > - lddqu 101(%rsi), %xmm0 > - movdqu %xmm0, 101(%rdi) > -L(bwd_write_101bytes): > - lddqu 85(%rsi), %xmm0 > - movdqu %xmm0, 85(%rdi) > -L(bwd_write_85bytes): > - lddqu 69(%rsi), %xmm0 > - movdqu %xmm0, 69(%rdi) > 
-L(bwd_write_69bytes): > - lddqu 53(%rsi), %xmm0 > - movdqu %xmm0, 53(%rdi) > -L(bwd_write_53bytes): > - lddqu 37(%rsi), %xmm0 > - movdqu %xmm0, 37(%rdi) > -L(bwd_write_37bytes): > - lddqu 21(%rsi), %xmm0 > - movdqu %xmm0, 21(%rdi) > -L(bwd_write_21bytes): > - lddqu 5(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 5(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_5bytes): > - mov 1(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 1(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_132bytes): > - lddqu 116(%rsi), %xmm0 > - movdqu %xmm0, 116(%rdi) > -L(bwd_write_116bytes): > - lddqu 100(%rsi), %xmm0 > - movdqu %xmm0, 100(%rdi) > -L(bwd_write_100bytes): > - lddqu 84(%rsi), %xmm0 > - movdqu %xmm0, 84(%rdi) > -L(bwd_write_84bytes): > - lddqu 68(%rsi), %xmm0 > - movdqu %xmm0, 68(%rdi) > -L(bwd_write_68bytes): > - lddqu 52(%rsi), %xmm0 > - movdqu %xmm0, 52(%rdi) > -L(bwd_write_52bytes): > - lddqu 36(%rsi), %xmm0 > - movdqu %xmm0, 36(%rdi) > -L(bwd_write_36bytes): > - lddqu 20(%rsi), %xmm0 > - movdqu %xmm0, 20(%rdi) > -L(bwd_write_20bytes): > - lddqu 4(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 4(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_4bytes): > - mov (%rsi), %edx > - mov %edx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_131bytes): > - lddqu 115(%rsi), %xmm0 > - movdqu %xmm0, 115(%rdi) > -L(bwd_write_115bytes): > - lddqu 99(%rsi), %xmm0 > - movdqu %xmm0, 99(%rdi) > -L(bwd_write_99bytes): > - lddqu 83(%rsi), %xmm0 > - movdqu %xmm0, 83(%rdi) > -L(bwd_write_83bytes): > - lddqu 67(%rsi), %xmm0 > - movdqu %xmm0, 67(%rdi) > -L(bwd_write_67bytes): > - lddqu 51(%rsi), %xmm0 > - movdqu %xmm0, 51(%rdi) > -L(bwd_write_51bytes): > - lddqu 35(%rsi), %xmm0 > - movdqu %xmm0, 35(%rdi) > -L(bwd_write_35bytes): > - lddqu 19(%rsi), %xmm0 > - movdqu %xmm0, 19(%rdi) > -L(bwd_write_19bytes): > - lddqu 3(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 3(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_3bytes): > - mov 1(%rsi), %dx > - mov (%rsi), %cx > - mov %dx, 1(%rdi) > - mov %cx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_130bytes): > - lddqu 114(%rsi), %xmm0 > - movdqu %xmm0, 114(%rdi) > -L(bwd_write_114bytes): > - lddqu 98(%rsi), %xmm0 > - movdqu %xmm0, 98(%rdi) > -L(bwd_write_98bytes): > - lddqu 82(%rsi), %xmm0 > - movdqu %xmm0, 82(%rdi) > -L(bwd_write_82bytes): > - lddqu 66(%rsi), %xmm0 > - movdqu %xmm0, 66(%rdi) > -L(bwd_write_66bytes): > - lddqu 50(%rsi), %xmm0 > - movdqu %xmm0, 50(%rdi) > -L(bwd_write_50bytes): > - lddqu 34(%rsi), %xmm0 > - movdqu %xmm0, 34(%rdi) > -L(bwd_write_34bytes): > - lddqu 18(%rsi), %xmm0 > - movdqu %xmm0, 18(%rdi) > -L(bwd_write_18bytes): > - lddqu 2(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 2(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_2bytes): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_129bytes): > - lddqu 113(%rsi), %xmm0 > - movdqu %xmm0, 113(%rdi) > -L(bwd_write_113bytes): > - lddqu 97(%rsi), %xmm0 > - movdqu %xmm0, 97(%rdi) > -L(bwd_write_97bytes): > - lddqu 81(%rsi), %xmm0 > - movdqu %xmm0, 81(%rdi) > -L(bwd_write_81bytes): > - lddqu 65(%rsi), %xmm0 > - movdqu %xmm0, 65(%rdi) > -L(bwd_write_65bytes): > - lddqu 49(%rsi), %xmm0 > - movdqu %xmm0, 49(%rdi) > -L(bwd_write_49bytes): > - lddqu 33(%rsi), %xmm0 > - movdqu %xmm0, 33(%rdi) > -L(bwd_write_33bytes): > - lddqu 17(%rsi), %xmm0 > - movdqu %xmm0, 17(%rdi) > -L(bwd_write_17bytes): > - lddqu 1(%rsi), %xmm0 > - 
lddqu (%rsi), %xmm1 > - movdqu %xmm0, 1(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_1bytes): > - movzbl (%rsi), %edx > - mov %dl, (%rdi) > - ret > - > -END (MEMCPY) > - > - .section .rodata.ssse3,"a",@progbits > - .p2align 3 > -L(table_144_bytes_bwd): > - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) > - 
.int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) > - .int JMPTBL 
(L(bwd_write_108bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) > - > - .p2align 3 > -L(table_144_bytes_fwd): > - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_19bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_75bytes), 
L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_131bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) > - > - .p2align 3 > -L(shl_table_fwd): > - .int JMPTBL (L(shl_0), L(shl_table_fwd)) > - .int JMPTBL (L(shl_1), L(shl_table_fwd)) > - .int JMPTBL (L(shl_2), L(shl_table_fwd)) > - .int JMPTBL (L(shl_3), L(shl_table_fwd)) > - .int JMPTBL (L(shl_4), L(shl_table_fwd)) > - .int JMPTBL (L(shl_5), L(shl_table_fwd)) > - .int JMPTBL (L(shl_6), L(shl_table_fwd)) > - .int JMPTBL (L(shl_7), L(shl_table_fwd)) > - .int JMPTBL (L(shl_8), L(shl_table_fwd)) > - .int JMPTBL (L(shl_9), L(shl_table_fwd)) > - .int JMPTBL (L(shl_10), L(shl_table_fwd)) > - .int JMPTBL (L(shl_11), L(shl_table_fwd)) > - .int JMPTBL (L(shl_12), L(shl_table_fwd)) > - .int JMPTBL (L(shl_13), L(shl_table_fwd)) > - .int JMPTBL (L(shl_14), L(shl_table_fwd)) > - .int JMPTBL (L(shl_15), L(shl_table_fwd)) > - > - .p2align 3 > -L(shl_table_bwd): > - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) > - > -#endif > diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > deleted file mode 100644 > index f9a4e9aff9..0000000000 > --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_MEMMOVE > -#define MEMCPY __memmove_ssse3_back > -#define MEMCPY_CHK __memmove_chk_ssse3_back > -#include "memcpy-ssse3-back.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
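A note on the L(table_144_bytes_fwd)/L(table_144_bytes_bwd) entries quoted above: each `.int JMPTBL (entry, table)` stores the 32-bit offset of the entry label relative to the table base, which keeps every slot 4 bytes and position-independent. A minimal sketch of the dispatch sequence, with illustrative register choices rather than the exact glibc macro expansion:

        lea     L(table_144_bytes_fwd)(%rip), %r11  /* table base */
        movslq  (%r11, %rdx, 4), %rcx   /* sign-extend the 32-bit relative entry for length %rdx */
        add     %r11, %rcx              /* relative offset + table base = absolute target */
        jmp     *%rcx                   /* tail-dispatch to L(fwd_write_<len>bytes) */

These per-length tables are the rodata that the follow-up patch below reclaims.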
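The `palignr` technique mentioned in the next patch (and implemented by the L(shl_N) paths that L(shl_table_fwd) above selects) stitches two aligned loads back into a misaligned byte stream, so no load ever splits a cache line. A minimal sketch for a source that sits 4 bytes past 16-byte alignment, assuming the destination has already been aligned; since the shift count must be an immediate, one such variant is needed per misalignment, L(shl_0) through L(shl_15):

        movaps  -4(%rsi), %xmm1         /* aligned load covering src[-4..11] */
        movaps  12(%rsi), %xmm2         /* next aligned load, src[12..27] */
        palignr $4, %xmm1, %xmm2        /* xmm2 = src[0..15], reassembled */
        movdqa  %xmm2, (%rdi)           /* one aligned 16-byte store */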
* [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
  2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
  ` (3 preceding siblings ...)
  2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
  2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
  5 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha

The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and EVEX
versions are generally preferable. memcpy/memmove is one exception where
avoiding unaligned loads with `palignr` is important for some targets.

This commit replaces memmove-ssse3 with a better optimized version that
has a lower code footprint. It also aliases memcpy to memmove.

Aside from this function, all other SSSE3 functions should be safe to
remove.

Performance is not changed drastically, although it shows overall
improvements without any major regressions or gains.

bench-memcpy geometric_mean(N=50) New / Original: 0.957
bench-memcpy-random geometric_mean(N=50) New / Original: 0.912
bench-memcpy-large geometric_mean(N=50) New / Original: 0.892

Benchmarks were run on Zhaoxin KX-6840@2000MHz. See attached numbers for
all results.

More importantly, this saves 7246 bytes of code size in memmove and an
additional 10741 bytes by reusing the memmove code for memcpy (17987
bytes saved in total). It also saves an additional 896 bytes of rodata
for the jump table entries.
---
Results For: bench-memcpy
length, align1, align2, dst > src, New Time / Old Time
1, 0, 0, 0, 0.946 1, 0, 0, 1, 0.946 1, 32, 0, 0, 0.948 1, 32, 0, 1, 1.185 1, 0, 32, 0, 0.982 1, 0, 32, 1, 1.14 1, 32, 32, 0, 0.981 1, 32, 32, 1, 1.057 1, 2048, 0, 0, 0.945 1, 2048, 0, 1, 0.945
2, 0, 0, 0, 1.041 2, 0, 0, 1, 1.041 2, 1, 0, 0, 1.044 2, 1, 0, 1, 1.044 2, 33, 0, 0, 1.044 2, 33, 0, 1, 1.044 2, 0, 1, 0, 1.041 2, 0, 1, 1, 1.041 2, 0, 33, 0, 1.042 2, 0, 33, 1, 1.041 2, 1, 1, 0, 1.041 2, 1, 1, 1, 1.041 2, 33, 33, 0, 1.041 2, 33, 33, 1, 1.041 2, 2048, 0, 0, 1.042 2, 2048, 0, 1, 1.041 2, 2049, 0, 0, 1.044 2, 2049, 0, 1, 1.044 2, 2048, 1, 0, 1.041 2, 2048, 1, 1, 1.042 2, 2049, 1, 0, 1.042 2, 2049, 1, 1, 1.042
4, 0, 0, 0, 0.962 4, 0, 0, 1, 0.962 4, 2, 0, 0, 0.98 4, 2, 0, 1, 0.984 4, 34, 0, 0, 0.986 4, 34, 0, 1, 0.987 4, 0, 2, 0, 0.962 4, 0, 2, 1, 0.962 4, 0, 34, 0, 0.962 4, 0, 34, 1, 0.962 4, 2, 2, 0, 0.962 4, 2, 2, 1, 0.962 4, 34, 34, 0, 0.962 4, 34, 34, 1, 0.962 4, 2048, 0, 0, 0.962 4, 2048, 0, 1, 0.962 4, 2050, 0, 0, 0.996 4, 2050, 0, 1, 1.0 4, 2048, 2, 0, 0.962 4, 2048, 2, 1, 0.962 4, 2050, 2, 0, 0.962 4, 2050, 2, 1, 0.962
8, 0, 0, 0, 0.962 8, 0, 0, 1, 0.962 8, 3, 0, 0, 1.0 8, 3, 0, 1, 1.0 8, 35, 0, 0, 1.001 8, 35, 0, 1, 1.0 8, 0, 3, 0, 0.962 8, 0, 3, 1, 0.962 8, 0, 35, 0, 0.962 8, 0, 35, 1, 0.962 8, 3, 3, 0, 0.962 8, 3, 3, 1, 0.962 8, 35, 35, 0, 0.962 8, 35, 35, 1, 0.962 8, 2048, 0, 0, 0.962 8, 2048, 0, 1, 0.962 8, 2051, 0, 0, 1.0 8, 2051, 0, 1, 1.0 8, 2048, 3, 0, 0.962 8, 2048, 3, 1, 0.962 8, 2051, 3, 0, 0.962 8, 2051, 3, 1, 0.962
16, 0, 0, 0, 0.798 16, 0, 0, 1, 0.799 16, 4, 0, 0, 0.801 16, 4, 0, 1, 0.801 16, 36, 0, 0, 0.801 16, 36, 0, 1, 0.801 16, 0, 4, 0, 0.798 16, 0, 4, 1, 0.799 16, 0, 36, 0, 0.799 16, 0, 36, 1, 0.799 16, 4, 4, 0, 0.799 16, 4, 4, 1, 0.799 16, 36, 36, 0, 0.799 16, 36, 36, 1, 0.799 16, 2048, 0, 0, 0.799 16, 2048, 0, 1, 0.799 16, 2052, 0, 0, 0.801 16, 2052, 0, 1, 0.801 16, 2048, 4, 0, 0.798 16, 2048, 4, 1, 0.799 16, 2052, 4, 0, 0.799 16, 2052, 4, 1, 0.799
32, 0, 0, 0, 0.472 32, 0, 0, 1, 0.472 32, 5, 0, 0,
0.472 32, 5, 0, 1, 0.472 32, 37, 0, 0, 0.962 32, 37, 0, 1, 0.962 32, 0, 5, 0, 0.472 32, 0, 5, 1, 0.472 32, 0, 37, 0, 1.021 32, 0, 37, 1, 1.021 32, 5, 5, 0, 0.472 32, 5, 5, 1, 0.472 32, 37, 37, 0, 1.011 32, 37, 37, 1, 1.011 32, 2048, 0, 0, 0.472 32, 2048, 0, 1, 0.472 32, 2053, 0, 0, 0.472 32, 2053, 0, 1, 0.472 32, 2048, 5, 0, 0.472 32, 2048, 5, 1, 0.472 32, 2053, 5, 0, 0.472 32, 2053, 5, 1, 0.472 64, 0, 0, 0, 1.0 64, 0, 0, 1, 1.0 64, 6, 0, 0, 0.862 64, 6, 0, 1, 0.862 64, 38, 0, 0, 0.912 64, 38, 0, 1, 0.912 64, 0, 6, 0, 0.896 64, 0, 6, 1, 0.896 64, 0, 38, 0, 0.906 64, 0, 38, 1, 0.906 64, 6, 6, 0, 0.91 64, 6, 6, 1, 0.91 64, 38, 38, 0, 0.883 64, 38, 38, 1, 0.883 64, 2048, 0, 0, 1.0 64, 2048, 0, 1, 1.0 64, 2054, 0, 0, 0.862 64, 2054, 0, 1, 0.862 64, 2048, 6, 0, 0.887 64, 2048, 6, 1, 0.887 64, 2054, 6, 0, 0.887 64, 2054, 6, 1, 0.887 128, 0, 0, 0, 0.857 128, 0, 0, 1, 0.857 128, 7, 0, 0, 0.875 128, 7, 0, 1, 0.875 128, 39, 0, 0, 0.892 128, 39, 0, 1, 0.892 128, 0, 7, 0, 1.183 128, 0, 7, 1, 1.183 128, 0, 39, 0, 1.113 128, 0, 39, 1, 1.113 128, 7, 7, 0, 0.692 128, 7, 7, 1, 0.692 128, 39, 39, 0, 1.104 128, 39, 39, 1, 1.104 128, 2048, 0, 0, 0.857 128, 2048, 0, 1, 0.857 128, 2055, 0, 0, 0.875 128, 2055, 0, 1, 0.875 128, 2048, 7, 0, 0.959 128, 2048, 7, 1, 0.959 128, 2055, 7, 0, 1.036 128, 2055, 7, 1, 1.036 256, 0, 0, 0, 0.889 256, 0, 0, 1, 0.889 256, 8, 0, 0, 0.966 256, 8, 0, 1, 0.966 256, 40, 0, 0, 0.983 256, 40, 0, 1, 0.983 256, 0, 8, 0, 1.29 256, 0, 8, 1, 1.29 256, 0, 40, 0, 1.274 256, 0, 40, 1, 1.274 256, 8, 8, 0, 0.865 256, 8, 8, 1, 0.865 256, 40, 40, 0, 1.477 256, 40, 40, 1, 1.477 256, 2048, 0, 0, 0.889 256, 2048, 0, 1, 0.889 256, 2056, 0, 0, 0.966 256, 2056, 0, 1, 0.966 256, 2048, 8, 0, 0.952 256, 2048, 8, 1, 0.952 256, 2056, 8, 0, 0.878 256, 2056, 8, 1, 0.878 512, 0, 0, 0, 1.077 512, 0, 0, 1, 1.077 512, 9, 0, 0, 1.0 512, 9, 0, 1, 1.0 512, 41, 0, 0, 0.954 512, 41, 0, 1, 0.954 512, 0, 9, 0, 1.191 512, 0, 9, 1, 1.191 512, 0, 41, 0, 1.181 512, 0, 41, 1, 1.181 512, 9, 9, 0, 0.765 512, 9, 9, 1, 0.765 512, 41, 41, 0, 0.905 512, 41, 41, 1, 0.905 512, 2048, 0, 0, 1.077 512, 2048, 0, 1, 1.077 512, 2057, 0, 0, 1.0 512, 2057, 0, 1, 1.0 512, 2048, 9, 0, 1.0 512, 2048, 9, 1, 1.0 512, 2057, 9, 0, 0.733 512, 2057, 9, 1, 0.733 1024, 0, 0, 0, 1.143 1024, 0, 0, 1, 1.143 1024, 10, 0, 0, 1.015 1024, 10, 0, 1, 1.015 1024, 42, 0, 0, 1.045 1024, 42, 0, 1, 1.045 1024, 0, 10, 0, 1.126 1024, 0, 10, 1, 1.126 1024, 0, 42, 0, 1.114 1024, 0, 42, 1, 1.114 1024, 10, 10, 0, 0.89 1024, 10, 10, 1, 0.89 1024, 42, 42, 0, 0.986 1024, 42, 42, 1, 0.986 1024, 2048, 0, 0, 1.143 1024, 2048, 0, 1, 1.143 1024, 2058, 0, 0, 1.015 1024, 2058, 0, 1, 1.015 1024, 2048, 10, 0, 1.03 1024, 2048, 10, 1, 1.03 1024, 2058, 10, 0, 0.854 1024, 2058, 10, 1, 0.854 2048, 0, 0, 0, 1.005 2048, 0, 0, 1, 1.005 2048, 11, 0, 0, 1.013 2048, 11, 0, 1, 1.014 2048, 43, 0, 0, 1.044 2048, 43, 0, 1, 1.044 2048, 0, 11, 0, 1.002 2048, 0, 11, 1, 1.003 2048, 0, 43, 0, 1.003 2048, 0, 43, 1, 1.003 2048, 11, 11, 0, 0.92 2048, 11, 11, 1, 0.92 2048, 43, 43, 0, 1.0 2048, 43, 43, 1, 1.0 2048, 2048, 0, 0, 1.005 2048, 2048, 0, 1, 1.005 2048, 2059, 0, 0, 0.904 2048, 2059, 0, 1, 0.904 2048, 2048, 11, 0, 1.0 2048, 2048, 11, 1, 1.0 2048, 2059, 11, 0, 0.979 2048, 2059, 11, 1, 0.979 4096, 0, 0, 0, 1.014 4096, 0, 0, 1, 1.014 4096, 12, 0, 0, 0.855 4096, 12, 0, 1, 0.855 4096, 44, 0, 0, 0.857 4096, 44, 0, 1, 0.857 4096, 0, 12, 0, 0.932 4096, 0, 12, 1, 0.932 4096, 0, 44, 0, 0.932 4096, 0, 44, 1, 0.933 4096, 12, 12, 0, 0.999 4096, 12, 12, 1, 0.999 4096, 44, 44, 0, 1.051 4096, 44, 44, 1, 1.051 4096, 
2048, 0, 0, 1.014 4096, 2048, 0, 1, 1.014 4096, 2060, 0, 0, 0.967 4096, 2060, 0, 1, 0.967 4096, 2048, 12, 0, 0.769 4096, 2048, 12, 1, 0.769 4096, 2060, 12, 0, 0.943 4096, 2060, 12, 1, 0.943 8192, 0, 0, 0, 1.045 8192, 0, 0, 1, 1.046 8192, 13, 0, 0, 0.885 8192, 13, 0, 1, 0.885 8192, 45, 0, 0, 0.887 8192, 45, 0, 1, 0.887 8192, 0, 13, 0, 0.942 8192, 0, 13, 1, 0.942 8192, 0, 45, 0, 0.942 8192, 0, 45, 1, 0.942 8192, 13, 13, 0, 1.03 8192, 13, 13, 1, 1.029 8192, 45, 45, 0, 1.048 8192, 45, 45, 1, 1.049 8192, 2048, 0, 0, 1.048 8192, 2048, 0, 1, 1.048 8192, 2061, 0, 0, 1.011 8192, 2061, 0, 1, 1.011 8192, 2048, 13, 0, 0.789 8192, 2048, 13, 1, 0.788 8192, 2061, 13, 0, 0.991 8192, 2061, 13, 1, 0.992 16384, 0, 0, 0, 1.026 16384, 0, 0, 1, 1.011 16384, 14, 0, 0, 0.943 16384, 14, 0, 1, 0.95 16384, 46, 0, 0, 0.856 16384, 46, 0, 1, 0.86 16384, 0, 14, 0, 0.815 16384, 0, 14, 1, 0.817 16384, 0, 46, 0, 0.859 16384, 0, 46, 1, 0.867 16384, 14, 14, 0, 0.987 16384, 14, 14, 1, 0.979 16384, 46, 46, 0, 1.027 16384, 46, 46, 1, 1.031 16384, 2048, 0, 0, 1.078 16384, 2048, 0, 1, 1.084 16384, 2062, 0, 0, 0.851 16384, 2062, 0, 1, 0.85 16384, 2048, 14, 0, 0.935 16384, 2048, 14, 1, 0.932 16384, 2062, 14, 0, 1.015 16384, 2062, 14, 1, 1.012 32768, 0, 0, 0, 0.978 32768, 0, 0, 1, 0.979 32768, 15, 0, 0, 1.006 32768, 15, 0, 1, 1.006 32768, 47, 0, 0, 1.004 32768, 47, 0, 1, 1.004 32768, 0, 15, 0, 1.045 32768, 0, 15, 1, 1.045 32768, 0, 47, 0, 1.011 32768, 0, 47, 1, 1.011 32768, 15, 15, 0, 0.977 32768, 15, 15, 1, 0.977 32768, 47, 47, 0, 0.96 32768, 47, 47, 1, 0.96 32768, 2048, 0, 0, 0.978 32768, 2048, 0, 1, 0.978 32768, 2063, 0, 0, 1.004 32768, 2063, 0, 1, 1.004 32768, 2048, 15, 0, 1.036 32768, 2048, 15, 1, 1.036 32768, 2063, 15, 0, 0.978 32768, 2063, 15, 1, 0.978 65536, 0, 0, 0, 0.981 65536, 0, 0, 1, 0.981 65536, 16, 0, 0, 0.987 65536, 16, 0, 1, 0.987 65536, 48, 0, 0, 0.968 65536, 48, 0, 1, 0.968 65536, 0, 16, 0, 1.014 65536, 0, 16, 1, 1.014 65536, 0, 48, 0, 0.984 65536, 0, 48, 1, 0.984 65536, 16, 16, 0, 1.01 65536, 16, 16, 1, 1.01 65536, 48, 48, 0, 0.968 65536, 48, 48, 1, 0.968 65536, 2048, 0, 0, 0.982 65536, 2048, 0, 1, 0.982 65536, 2064, 0, 0, 0.987 65536, 2064, 0, 1, 0.987 65536, 2048, 16, 0, 1.012 65536, 2048, 16, 1, 1.012 65536, 2064, 16, 0, 1.007 65536, 2064, 16, 1, 1.007 0, 0, 0, 0, 0.867 0, 2048, 0, 0, 0.867 0, 4095, 0, 0, 0.868 0, 0, 4095, 0, 0.866 1, 1, 0, 0, 1.108 1, 0, 1, 0, 0.946 1, 1, 1, 0, 0.946 1, 2049, 0, 0, 0.947 1, 2048, 1, 0, 0.945 1, 2049, 1, 0, 0.945 1, 4095, 0, 0, 1.482 1, 0, 4095, 0, 0.981 2, 2, 0, 0, 1.044 2, 0, 2, 0, 1.041 2, 2, 2, 0, 1.041 2, 2050, 0, 0, 1.044 2, 2048, 2, 0, 1.042 2, 2050, 2, 0, 1.041 2, 4095, 0, 0, 1.057 2, 0, 4095, 0, 1.022 3, 0, 0, 0, 0.899 3, 3, 0, 0, 0.902 3, 0, 3, 0, 0.9 3, 3, 3, 0, 0.9 3, 2048, 0, 0, 0.9 3, 2051, 0, 0, 0.902 3, 2048, 3, 0, 0.9 3, 2051, 3, 0, 0.9 3, 4095, 0, 0, 0.261 3, 0, 4095, 0, 0.211 4, 4, 0, 0, 0.965 4, 0, 4, 0, 0.962 4, 4, 4, 0, 0.962 4, 2052, 0, 0, 0.969 4, 2048, 4, 0, 0.962 4, 2052, 4, 0, 0.962 4, 4095, 0, 0, 1.971 4, 0, 4095, 0, 1.988 5, 0, 0, 0, 0.898 5, 5, 0, 0, 0.9 5, 0, 5, 0, 0.898 5, 5, 5, 0, 0.898 5, 2048, 0, 0, 0.898 5, 2053, 0, 0, 0.9 5, 2048, 5, 0, 0.898 5, 2053, 5, 0, 0.898 5, 4095, 0, 0, 0.935 5, 0, 4095, 0, 1.02 6, 0, 0, 0, 0.898 6, 6, 0, 0, 0.9 6, 0, 6, 0, 0.898 6, 6, 6, 0, 0.898 6, 2048, 0, 0, 0.898 6, 2054, 0, 0, 0.9 6, 2048, 6, 0, 0.898 6, 2054, 6, 0, 0.898 6, 4095, 0, 0, 0.935 6, 0, 4095, 0, 1.021 7, 0, 0, 0, 0.898 7, 7, 0, 0, 0.9 7, 0, 7, 0, 0.898 7, 7, 7, 0, 0.898 7, 2048, 0, 0, 0.898 7, 2055, 0, 0, 0.9 7, 2048, 7, 0, 0.898 7, 2055, 7, 0, 
0.898 7, 4095, 0, 0, 0.935 7, 0, 4095, 0, 1.021 8, 8, 0, 0, 1.001 8, 0, 8, 0, 0.962 8, 8, 8, 0, 0.962 8, 2056, 0, 0, 1.0 8, 2048, 8, 0, 0.962 8, 2056, 8, 0, 0.962 8, 4095, 0, 0, 1.971 8, 0, 4095, 0, 1.988 9, 0, 0, 0, 0.898 9, 9, 0, 0, 0.9 9, 0, 9, 0, 0.899 9, 9, 9, 0, 0.899 9, 2048, 0, 0, 0.899 9, 2057, 0, 0, 0.9 9, 2048, 9, 0, 0.899 9, 2057, 9, 0, 0.899 9, 4095, 0, 0, 0.935 9, 0, 4095, 0, 1.019 10, 0, 0, 0, 0.898 10, 10, 0, 0, 0.9 10, 0, 10, 0, 0.899 10, 10, 10, 0, 0.899 10, 2048, 0, 0, 0.899 10, 2058, 0, 0, 0.9 10, 2048, 10, 0, 0.899 10, 2058, 10, 0, 0.899 10, 4095, 0, 0, 0.935 10, 0, 4095, 0, 1.02 11, 0, 0, 0, 0.898 11, 11, 0, 0, 0.9 11, 0, 11, 0, 0.899 11, 11, 11, 0, 0.899 11, 2048, 0, 0, 0.899 11, 2059, 0, 0, 0.9 11, 2048, 11, 0, 0.899 11, 2059, 11, 0, 0.899 11, 4095, 0, 0, 0.935 11, 0, 4095, 0, 1.02 12, 0, 0, 0, 0.898 12, 12, 0, 0, 0.9 12, 0, 12, 0, 0.899 12, 12, 12, 0, 0.899 12, 2048, 0, 0, 0.899 12, 2060, 0, 0, 0.9 12, 2048, 12, 0, 0.899 12, 2060, 12, 0, 0.899 12, 4095, 0, 0, 0.935 12, 0, 4095, 0, 1.018 13, 0, 0, 0, 0.897 13, 13, 0, 0, 0.901 13, 0, 13, 0, 0.898 13, 13, 13, 0, 0.898 13, 2048, 0, 0, 0.898 13, 2061, 0, 0, 0.9 13, 2048, 13, 0, 0.898 13, 2061, 13, 0, 0.898 13, 4095, 0, 0, 0.935 13, 0, 4095, 0, 1.019 14, 0, 0, 0, 0.897 14, 14, 0, 0, 0.9 14, 0, 14, 0, 0.898 14, 14, 14, 0, 0.898 14, 2048, 0, 0, 0.898 14, 2062, 0, 0, 0.9 14, 2048, 14, 0, 0.898 14, 2062, 14, 0, 0.898 14, 4095, 0, 0, 0.935 14, 0, 4095, 0, 1.02 15, 0, 0, 0, 0.897 15, 15, 0, 0, 0.901 15, 0, 15, 0, 0.898 15, 15, 15, 0, 0.898 15, 2048, 0, 0, 0.898 15, 2063, 0, 0, 0.9 15, 2048, 15, 0, 0.898 15, 2063, 15, 0, 0.898 15, 4095, 0, 0, 0.935 15, 0, 4095, 0, 1.02 16, 16, 0, 0, 0.801 16, 0, 16, 0, 0.799 16, 16, 16, 0, 0.799 16, 2064, 0, 0, 0.801 16, 2048, 16, 0, 0.799 16, 2064, 16, 0, 0.799 16, 4095, 0, 0, 1.818 16, 0, 4095, 0, 1.957 17, 0, 0, 0, 0.798 17, 17, 0, 0, 0.801 17, 0, 17, 0, 0.799 17, 17, 17, 0, 0.799 17, 2048, 0, 0, 0.799 17, 2065, 0, 0, 0.801 17, 2048, 17, 0, 0.799 17, 2065, 17, 0, 0.799 17, 4095, 0, 0, 0.938 17, 0, 4095, 0, 1.021 18, 0, 0, 0, 0.798 18, 18, 0, 0, 0.801 18, 0, 18, 0, 0.799 18, 18, 18, 0, 0.799 18, 2048, 0, 0, 0.799 18, 2066, 0, 0, 0.801 18, 2048, 18, 0, 0.799 18, 2066, 18, 0, 0.799 18, 4095, 0, 0, 0.938 18, 0, 4095, 0, 1.021 19, 0, 0, 0, 0.798 19, 19, 0, 0, 0.801 19, 0, 19, 0, 0.799 19, 19, 19, 0, 0.799 19, 2048, 0, 0, 0.799 19, 2067, 0, 0, 0.801 19, 2048, 19, 0, 0.799 19, 2067, 19, 0, 0.799 19, 4095, 0, 0, 0.938 19, 0, 4095, 0, 1.021 20, 0, 0, 0, 0.798 20, 20, 0, 0, 0.801 20, 0, 20, 0, 0.799 20, 20, 20, 0, 0.799 20, 2048, 0, 0, 0.799 20, 2068, 0, 0, 0.801 20, 2048, 20, 0, 0.799 20, 2068, 20, 0, 0.799 20, 4095, 0, 0, 0.937 20, 0, 4095, 0, 1.021 21, 0, 0, 0, 0.798 21, 21, 0, 0, 0.801 21, 0, 21, 0, 0.799 21, 21, 21, 0, 0.799 21, 2048, 0, 0, 0.799 21, 2069, 0, 0, 0.801 21, 2048, 21, 0, 0.799 21, 2069, 21, 0, 0.799 21, 4095, 0, 0, 0.938 21, 0, 4095, 0, 1.021 22, 0, 0, 0, 0.798 22, 22, 0, 0, 0.801 22, 0, 22, 0, 0.799 22, 22, 22, 0, 0.799 22, 2048, 0, 0, 0.799 22, 2070, 0, 0, 0.801 22, 2048, 22, 0, 0.799 22, 2070, 22, 0, 0.799 22, 4095, 0, 0, 0.938 22, 0, 4095, 0, 1.021 23, 0, 0, 0, 0.798 23, 23, 0, 0, 0.801 23, 0, 23, 0, 0.799 23, 23, 23, 0, 0.799 23, 2048, 0, 0, 0.799 23, 2071, 0, 0, 0.801 23, 2048, 23, 0, 0.799 23, 2071, 23, 0, 0.799 23, 4095, 0, 0, 0.938 23, 0, 4095, 0, 1.021 24, 0, 0, 0, 0.798 24, 24, 0, 0, 0.801 24, 0, 24, 0, 0.799 24, 24, 24, 0, 0.799 24, 2048, 0, 0, 0.799 24, 2072, 0, 0, 0.801 24, 2048, 24, 0, 0.799 24, 2072, 24, 0, 0.799 24, 4095, 0, 0, 0.937 24, 0, 4095, 0, 1.021 25, 0, 0, 
0, 0.501 25, 25, 0, 0, 0.502 25, 0, 25, 0, 0.502 25, 25, 25, 0, 0.501 25, 2048, 0, 0, 0.501 25, 2073, 0, 0, 0.502 25, 2048, 25, 0, 0.502 25, 2073, 25, 0, 0.501 25, 4095, 0, 0, 0.974 25, 0, 4095, 0, 0.98 26, 0, 0, 0, 0.501 26, 26, 0, 0, 0.502 26, 0, 26, 0, 0.502 26, 26, 26, 0, 0.501 26, 2048, 0, 0, 0.501 26, 2074, 0, 0, 0.502 26, 2048, 26, 0, 0.502 26, 2074, 26, 0, 0.501 26, 4095, 0, 0, 0.974 26, 0, 4095, 0, 1.0 27, 0, 0, 0, 0.501 27, 27, 0, 0, 0.502 27, 0, 27, 0, 0.502 27, 27, 27, 0, 0.501 27, 2048, 0, 0, 0.501 27, 2075, 0, 0, 0.502 27, 2048, 27, 0, 0.502 27, 2075, 27, 0, 0.501 27, 4095, 0, 0, 0.974 27, 0, 4095, 0, 1.0 28, 0, 0, 0, 0.501 28, 28, 0, 0, 0.502 28, 0, 28, 0, 0.502 28, 28, 28, 0, 0.501 28, 2048, 0, 0, 0.501 28, 2076, 0, 0, 0.502 28, 2048, 28, 0, 0.502 28, 2076, 28, 0, 0.502 28, 4095, 0, 0, 0.974 28, 0, 4095, 0, 1.0 29, 0, 0, 0, 0.472 29, 29, 0, 0, 0.472 29, 0, 29, 0, 0.472 29, 29, 29, 0, 0.472 29, 2048, 0, 0, 0.472 29, 2077, 0, 0, 0.472 29, 2048, 29, 0, 0.472 29, 2077, 29, 0, 0.472 29, 4095, 0, 0, 0.974 29, 0, 4095, 0, 1.0 30, 0, 0, 0, 0.472 30, 30, 0, 0, 0.472 30, 0, 30, 0, 0.472 30, 30, 30, 0, 0.472 30, 2048, 0, 0, 0.472 30, 2078, 0, 0, 0.472 30, 2048, 30, 0, 0.472 30, 2078, 30, 0, 0.472 30, 4095, 0, 0, 0.974 30, 0, 4095, 0, 1.0 31, 0, 0, 0, 0.472 31, 31, 0, 0, 0.472 31, 0, 31, 0, 0.472 31, 31, 31, 0, 0.472 31, 2048, 0, 0, 0.472 31, 2079, 0, 0, 0.472 31, 2048, 31, 0, 0.472 31, 2079, 31, 0, 0.472 31, 4095, 0, 0, 0.974 31, 0, 4095, 0, 1.0 48, 0, 0, 0, 1.0 48, 0, 0, 1, 1.0 48, 3, 0, 0, 1.0 48, 3, 0, 1, 1.0 48, 0, 3, 0, 1.0 48, 0, 3, 1, 1.0 48, 3, 3, 0, 1.0 48, 3, 3, 1, 1.0 48, 2048, 0, 0, 1.0 48, 2048, 0, 1, 1.0 48, 2051, 0, 0, 1.0 48, 2051, 0, 1, 1.0 48, 2048, 3, 0, 1.0 48, 2048, 3, 1, 1.0 48, 2051, 3, 0, 1.0 48, 2051, 3, 1, 1.0 80, 0, 0, 0, 0.781 80, 0, 0, 1, 0.782 80, 5, 0, 0, 0.976 80, 5, 0, 1, 0.976 80, 0, 5, 0, 1.232 80, 0, 5, 1, 1.232 80, 5, 5, 0, 1.542 80, 5, 5, 1, 1.543 80, 2048, 0, 0, 0.781 80, 2048, 0, 1, 0.782 80, 2053, 0, 0, 0.976 80, 2053, 0, 1, 0.976 80, 2048, 5, 0, 1.093 80, 2048, 5, 1, 1.093 80, 2053, 5, 0, 1.371 80, 2053, 5, 1, 1.371 96, 0, 0, 0, 0.758 96, 0, 0, 1, 0.758 96, 6, 0, 0, 0.929 96, 6, 0, 1, 0.929 96, 0, 6, 0, 1.204 96, 0, 6, 1, 1.204 96, 6, 6, 0, 1.559 96, 6, 6, 1, 1.562 96, 2048, 0, 0, 0.758 96, 2048, 0, 1, 0.758 96, 2054, 0, 0, 0.929 96, 2054, 0, 1, 0.929 96, 2048, 6, 0, 1.068 96, 2048, 6, 1, 1.068 96, 2054, 6, 0, 1.562 96, 2054, 6, 1, 1.562 112, 0, 0, 0, 0.736 112, 0, 0, 1, 0.736 112, 7, 0, 0, 0.675 112, 7, 0, 1, 0.675 112, 0, 7, 0, 0.778 112, 0, 7, 1, 0.778 112, 7, 7, 0, 0.909 112, 7, 7, 1, 0.909 112, 2048, 0, 0, 0.736 112, 2048, 0, 1, 0.736 112, 2055, 0, 0, 0.675 112, 2055, 0, 1, 0.675 112, 2048, 7, 0, 0.778 112, 2048, 7, 1, 0.778 112, 2055, 7, 0, 0.909 112, 2055, 7, 1, 0.909 144, 0, 0, 0, 0.857 144, 0, 0, 1, 0.857 144, 9, 0, 0, 0.939 144, 9, 0, 1, 0.939 144, 0, 9, 0, 1.137 144, 0, 9, 1, 1.137 144, 9, 9, 0, 1.514 144, 9, 9, 1, 1.514 144, 2048, 0, 0, 0.857 144, 2048, 0, 1, 0.857 144, 2057, 0, 0, 0.939 144, 2057, 0, 1, 0.939 144, 2048, 9, 0, 0.922 144, 2048, 9, 1, 0.922 144, 2057, 9, 0, 1.514 144, 2057, 9, 1, 1.514 160, 0, 0, 0, 0.698 160, 0, 0, 1, 0.698 160, 10, 0, 0, 0.91 160, 10, 0, 1, 0.91 160, 0, 10, 0, 1.211 160, 0, 10, 1, 1.212 160, 10, 10, 0, 1.357 160, 10, 10, 1, 1.357 160, 2048, 0, 0, 0.698 160, 2048, 0, 1, 0.698 160, 2058, 0, 0, 0.91 160, 2058, 0, 1, 0.91 160, 2048, 10, 0, 0.923 160, 2048, 10, 1, 0.923 160, 2058, 10, 0, 1.357 160, 2058, 10, 1, 1.357 176, 0, 0, 0, 0.796 176, 0, 0, 1, 0.796 176, 11, 0, 0, 0.804 176, 11, 0, 1, 0.804 176, 
0, 11, 0, 0.774 176, 0, 11, 1, 0.774 176, 11, 11, 0, 0.814 176, 11, 11, 1, 0.814 176, 2048, 0, 0, 0.796 176, 2048, 0, 1, 0.796 176, 2059, 0, 0, 0.804 176, 2059, 0, 1, 0.804 176, 2048, 11, 0, 0.774 176, 2048, 11, 1, 0.774 176, 2059, 11, 0, 0.814 176, 2059, 11, 1, 0.814 192, 0, 0, 0, 0.778 192, 0, 0, 1, 0.778 192, 12, 0, 0, 0.881 192, 12, 0, 1, 0.881 192, 0, 12, 0, 1.167 192, 0, 12, 1, 1.167 192, 12, 12, 0, 0.841 192, 12, 12, 1, 0.841 192, 2048, 0, 0, 0.778 192, 2048, 0, 1, 0.778 192, 2060, 0, 0, 0.881 192, 2060, 0, 1, 0.881 192, 2048, 12, 0, 0.889 192, 2048, 12, 1, 0.889 192, 2060, 12, 0, 0.906 192, 2060, 12, 1, 0.906 208, 0, 0, 0, 0.833 208, 0, 0, 1, 0.833 208, 13, 0, 0, 0.921 208, 13, 0, 1, 0.921 208, 0, 13, 0, 1.003 208, 0, 13, 1, 0.85 208, 13, 13, 0, 1.333 208, 13, 13, 1, 1.333 208, 2048, 0, 0, 0.834 208, 2048, 0, 1, 0.833 208, 2061, 0, 0, 0.921 208, 2061, 0, 1, 0.921 208, 2048, 13, 0, 0.833 208, 2048, 13, 1, 0.833 208, 2061, 13, 0, 1.333 208, 2061, 13, 1, 1.333 224, 0, 0, 0, 0.93 224, 0, 0, 1, 0.93 224, 14, 0, 0, 1.0 224, 14, 0, 1, 1.0 224, 0, 14, 0, 1.15 224, 0, 14, 1, 1.15 224, 14, 14, 0, 1.452 224, 14, 14, 1, 1.452 224, 2048, 0, 0, 0.93 224, 2048, 0, 1, 0.93 224, 2062, 0, 0, 1.0 224, 2062, 0, 1, 1.0 224, 2048, 14, 0, 0.833 224, 2048, 14, 1, 0.833 224, 2062, 14, 0, 1.452 224, 2062, 14, 1, 1.452 240, 0, 0, 0, 0.909 240, 0, 0, 1, 0.909 240, 15, 0, 0, 0.797 240, 15, 0, 1, 0.797 240, 0, 15, 0, 0.771 240, 0, 15, 1, 0.771 240, 15, 15, 0, 0.93 240, 15, 15, 1, 0.93 240, 2048, 0, 0, 0.909 240, 2048, 0, 1, 0.909 240, 2063, 0, 0, 0.797 240, 2063, 0, 1, 0.797 240, 2048, 15, 0, 0.771 240, 2048, 15, 1, 0.771 240, 2063, 15, 0, 0.93 240, 2063, 15, 1, 0.93 272, 0, 0, 0, 0.9 272, 0, 0, 1, 0.9 272, 17, 0, 0, 1.015 272, 17, 0, 1, 1.015 272, 0, 17, 0, 0.927 272, 0, 17, 1, 0.927 272, 17, 17, 0, 0.892 272, 17, 17, 1, 0.892 272, 2048, 0, 0, 0.9 272, 2048, 0, 1, 0.9 272, 2065, 0, 0, 1.015 272, 2065, 0, 1, 1.015 272, 2048, 17, 0, 0.927 272, 2048, 17, 1, 0.927 272, 2065, 17, 0, 0.878 272, 2065, 17, 1, 0.878 288, 0, 0, 0, 0.882 288, 0, 0, 1, 0.882 288, 18, 0, 0, 0.803 288, 18, 0, 1, 0.803 288, 0, 18, 0, 0.768 288, 0, 18, 1, 0.768 288, 18, 18, 0, 0.882 288, 18, 18, 1, 0.882 288, 2048, 0, 0, 0.882 288, 2048, 0, 1, 0.882 288, 2066, 0, 0, 0.803 288, 2066, 0, 1, 0.803 288, 2048, 18, 0, 0.768 288, 2048, 18, 1, 0.768 288, 2066, 18, 0, 0.882 288, 2066, 18, 1, 0.882 304, 0, 0, 0, 0.865 304, 0, 0, 1, 0.866 304, 19, 0, 0, 0.944 304, 19, 0, 1, 0.944 304, 0, 19, 0, 0.943 304, 0, 19, 1, 0.943 304, 19, 19, 0, 0.956 304, 19, 19, 1, 0.956 304, 2048, 0, 0, 0.865 304, 2048, 0, 1, 0.865 304, 2067, 0, 0, 0.944 304, 2067, 0, 1, 0.944 304, 2048, 19, 0, 0.943 304, 2048, 19, 1, 0.943 304, 2067, 19, 0, 0.947 304, 2067, 19, 1, 0.947 320, 0, 0, 0, 0.944 320, 0, 0, 1, 0.944 320, 20, 0, 0, 0.962 320, 20, 0, 1, 0.962 320, 0, 20, 0, 1.214 320, 0, 20, 1, 1.214 320, 20, 20, 0, 1.365 320, 20, 20, 1, 1.365 320, 2048, 0, 0, 0.944 320, 2048, 0, 1, 0.944 320, 2068, 0, 0, 0.962 320, 2068, 0, 1, 0.962 320, 2048, 20, 0, 0.914 320, 2048, 20, 1, 0.914 320, 2068, 20, 0, 1.365 320, 2068, 20, 1, 1.365 336, 0, 0, 0, 1.0 336, 0, 0, 1, 1.0 336, 21, 0, 0, 0.986 336, 21, 0, 1, 0.986 336, 0, 21, 0, 0.853 336, 0, 21, 1, 0.853 336, 21, 21, 0, 0.843 336, 21, 21, 1, 0.843 336, 2048, 0, 0, 1.0 336, 2048, 0, 1, 1.0 336, 2069, 0, 0, 0.986 336, 2069, 0, 1, 0.986 336, 2048, 21, 0, 0.853 336, 2048, 21, 1, 0.853 336, 2069, 21, 0, 0.831 336, 2069, 21, 1, 0.831 352, 0, 0, 0, 0.98 352, 0, 0, 1, 0.98 352, 22, 0, 0, 0.811 352, 22, 0, 1, 0.811 352, 0, 22, 0, 0.882 352, 0, 22, 1, 
0.882 352, 22, 22, 0, 1.1 352, 22, 22, 1, 1.1 352, 2048, 0, 0, 0.98 352, 2048, 0, 1, 0.98 352, 2070, 0, 0, 0.811 352, 2070, 0, 1, 0.811 352, 2048, 22, 0, 0.882 352, 2048, 22, 1, 0.882 352, 2070, 22, 0, 1.1 352, 2070, 22, 1, 1.1 368, 0, 0, 0, 1.058 368, 0, 0, 1, 1.058 368, 23, 0, 0, 1.0 368, 23, 0, 1, 1.0 368, 0, 23, 0, 0.948 368, 0, 23, 1, 0.948 368, 23, 23, 0, 0.723 368, 23, 23, 1, 0.723 368, 2048, 0, 0, 1.058 368, 2048, 0, 1, 1.058 368, 2071, 0, 0, 1.0 368, 2071, 0, 1, 1.0 368, 2048, 23, 0, 0.948 368, 2048, 23, 1, 0.948 368, 2071, 23, 0, 0.701 368, 2071, 23, 1, 0.701 384, 0, 0, 0, 1.012 384, 0, 0, 1, 1.012 384, 24, 0, 0, 1.04 384, 24, 0, 1, 1.04 384, 0, 24, 0, 1.154 384, 0, 24, 1, 1.154 384, 24, 24, 0, 1.423 384, 24, 24, 1, 1.423 384, 2048, 0, 0, 1.012 384, 2048, 0, 1, 1.012 384, 2072, 0, 0, 1.04 384, 2072, 0, 1, 1.04 384, 2048, 24, 0, 0.91 384, 2048, 24, 1, 0.91 384, 2072, 24, 0, 1.423 384, 2072, 24, 1, 1.423 400, 0, 0, 0, 0.948 400, 0, 0, 1, 0.948 400, 25, 0, 0, 0.957 400, 25, 0, 1, 0.957 400, 0, 25, 0, 1.054 400, 0, 25, 1, 1.097 400, 25, 25, 0, 0.885 400, 25, 25, 1, 0.885 400, 2048, 0, 0, 0.948 400, 2048, 0, 1, 0.948 400, 2073, 0, 0, 0.957 400, 2073, 0, 1, 0.957 400, 2048, 25, 0, 0.94 400, 2048, 25, 1, 0.94 400, 2073, 25, 0, 0.908 400, 2073, 25, 1, 0.908 416, 0, 0, 0, 1.017 416, 0, 0, 1, 1.017 416, 26, 0, 0, 0.903 416, 26, 0, 1, 0.903 416, 0, 26, 0, 0.881 416, 0, 26, 1, 0.881 416, 26, 26, 0, 1.035 416, 26, 26, 1, 1.035 416, 2048, 0, 0, 1.017 416, 2048, 0, 1, 1.017 416, 2074, 0, 0, 0.903 416, 2074, 0, 1, 0.903 416, 2048, 26, 0, 0.881 416, 2048, 26, 1, 0.881 416, 2074, 26, 0, 1.035 416, 2074, 26, 1, 1.035 432, 0, 0, 0, 1.0 432, 0, 0, 1, 1.0 432, 27, 0, 0, 0.933 432, 27, 0, 1, 0.933 432, 0, 27, 0, 0.941 432, 0, 27, 1, 0.941 432, 27, 27, 0, 0.953 432, 27, 27, 1, 0.954 432, 2048, 0, 0, 1.0 432, 2048, 0, 1, 1.0 432, 2075, 0, 0, 0.933 432, 2075, 0, 1, 0.933 432, 2048, 27, 0, 0.941 432, 2048, 27, 1, 0.941 432, 2075, 27, 0, 0.93 432, 2075, 27, 1, 0.93 448, 0, 0, 0, 0.984 448, 0, 0, 1, 0.984 448, 28, 0, 0, 0.896 448, 28, 0, 1, 0.896 448, 0, 28, 0, 1.244 448, 0, 28, 1, 1.244 448, 28, 28, 0, 1.333 448, 28, 28, 1, 1.333 448, 2048, 0, 0, 0.984 448, 2048, 0, 1, 0.984 448, 2076, 0, 0, 0.896 448, 2076, 0, 1, 0.896 448, 2048, 28, 0, 0.988 448, 2048, 28, 1, 0.988 448, 2076, 28, 0, 1.333 448, 2076, 28, 1, 1.333 464, 0, 0, 0, 1.083 464, 0, 0, 1, 1.083 464, 29, 0, 0, 0.978 464, 29, 0, 1, 0.978 464, 0, 29, 0, 0.924 464, 0, 29, 1, 0.924 464, 29, 29, 0, 0.901 464, 29, 29, 1, 0.901 464, 2048, 0, 0, 1.083 464, 2048, 0, 1, 1.083 464, 2077, 0, 0, 0.978 464, 2077, 0, 1, 0.978 464, 2048, 29, 0, 0.924 464, 2048, 29, 1, 0.924 464, 2077, 29, 0, 0.89 464, 2077, 29, 1, 0.89 480, 0, 0, 0, 1.066 480, 0, 0, 1, 1.066 480, 30, 0, 0, 0.9 480, 30, 0, 1, 0.9 480, 0, 30, 0, 0.88 480, 0, 30, 1, 0.88 480, 30, 30, 0, 1.083 480, 30, 30, 1, 1.083 480, 2048, 0, 0, 1.066 480, 2048, 0, 1, 1.066 480, 2078, 0, 0, 0.9 480, 2078, 0, 1, 0.9 480, 2048, 30, 0, 0.88 480, 2048, 30, 1, 0.88 480, 2078, 30, 0, 1.083 480, 2078, 30, 1, 1.083 496, 0, 0, 0, 1.032 496, 0, 0, 1, 1.032 496, 31, 0, 0, 0.95 496, 31, 0, 1, 0.95 496, 0, 31, 0, 1.011 496, 0, 31, 1, 1.011 496, 31, 31, 0, 0.973 496, 31, 31, 1, 0.973 496, 2048, 0, 0, 1.032 496, 2048, 0, 1, 1.032 496, 2079, 0, 0, 0.95 496, 2079, 0, 1, 0.95 496, 2048, 31, 0, 1.011 496, 2048, 31, 1, 1.011 496, 2079, 31, 0, 0.941 496, 2079, 31, 1, 0.941 1024, 32, 0, 0, 1.143 1024, 32, 0, 1, 1.143 1024, 0, 32, 0, 1.143 1024, 0, 32, 1, 1.143 1024, 32, 32, 0, 1.143 1024, 32, 32, 1, 1.143 1024, 2080, 0, 0, 1.143 1024, 
2080, 0, 1, 1.143 1024, 2048, 32, 0, 1.143 1024, 2048, 32, 1, 1.143 1024, 2080, 32, 0, 1.143 1024, 2080, 32, 1, 1.143 1056, 0, 0, 0, 1.165 1056, 0, 0, 1, 1.162 1056, 33, 0, 0, 1.067 1056, 33, 0, 1, 1.067 1056, 0, 33, 0, 0.977 1056, 0, 33, 1, 0.977 1056, 33, 33, 0, 1.043 1056, 33, 33, 1, 1.043 1056, 2048, 0, 0, 1.168 1056, 2048, 0, 1, 1.168 1056, 2081, 0, 0, 1.067 1056, 2081, 0, 1, 1.067 1056, 2048, 33, 0, 0.977 1056, 2048, 33, 1, 0.977 1056, 2081, 33, 0, 1.0 1056, 2081, 33, 1, 1.0 1088, 0, 0, 0, 1.171 1088, 0, 0, 1, 1.171 1088, 34, 0, 0, 1.041 1088, 34, 0, 1, 1.041 1088, 0, 34, 0, 1.079 1088, 0, 34, 1, 1.079 1088, 34, 34, 0, 0.966 1088, 34, 34, 1, 0.966 1088, 2048, 0, 0, 1.171 1088, 2048, 0, 1, 1.171 1088, 2082, 0, 0, 1.041 1088, 2082, 0, 1, 1.041 1088, 2048, 34, 0, 0.994 1088, 2048, 34, 1, 0.994 1088, 2082, 34, 0, 0.966 1088, 2082, 34, 1, 0.966 1120, 0, 0, 0, 1.154 1120, 0, 0, 1, 1.151 1120, 35, 0, 0, 1.051 1120, 35, 0, 1, 1.051 1120, 0, 35, 0, 1.0 1120, 0, 35, 1, 1.0 1120, 35, 35, 0, 1.068 1120, 35, 35, 1, 1.068 1120, 2048, 0, 0, 1.151 1120, 2048, 0, 1, 1.151 1120, 2083, 0, 0, 1.051 1120, 2083, 0, 1, 1.051 1120, 2048, 35, 0, 1.0 1120, 2048, 35, 1, 1.0 1120, 2083, 35, 0, 1.027 1120, 2083, 35, 1, 1.027 1152, 0, 0, 0, 1.159 1152, 0, 0, 1, 1.159 1152, 36, 0, 0, 1.034 1152, 36, 0, 1, 1.034 1152, 0, 36, 0, 1.07 1152, 0, 36, 1, 1.07 1152, 36, 36, 0, 0.967 1152, 36, 36, 1, 0.967 1152, 2048, 0, 0, 1.159 1152, 2048, 0, 1, 1.159 1152, 2084, 0, 0, 1.034 1152, 2084, 0, 1, 1.034 1152, 2048, 36, 0, 0.984 1152, 2048, 36, 1, 0.984 1152, 2084, 36, 0, 0.967 1152, 2084, 36, 1, 0.967 1184, 0, 0, 0, 1.157 1184, 0, 0, 1, 1.157 1184, 37, 0, 0, 1.066 1184, 37, 0, 1, 1.066 1184, 0, 37, 0, 0.993 1184, 0, 37, 1, 0.993 1184, 37, 37, 0, 1.08 1184, 37, 37, 1, 1.081 1184, 2048, 0, 0, 1.157 1184, 2048, 0, 1, 1.157 1184, 2085, 0, 0, 1.066 1184, 2085, 0, 1, 1.066 1184, 2048, 37, 0, 0.993 1184, 2048, 37, 1, 0.993 1184, 2085, 37, 0, 1.04 1184, 2085, 37, 1, 1.04 1216, 0, 0, 0, 1.139 1216, 0, 0, 1, 1.139 1216, 38, 0, 0, 1.024 1216, 38, 0, 1, 1.024 1216, 0, 38, 0, 1.086 1216, 0, 38, 1, 1.087 1216, 38, 38, 0, 1.0 1216, 38, 38, 1, 1.0 1216, 2048, 0, 0, 1.138 1216, 2048, 0, 1, 1.138 1216, 2086, 0, 0, 1.024 1216, 2086, 0, 1, 1.024 1216, 2048, 38, 0, 1.01 1216, 2048, 38, 1, 1.01 1216, 2086, 38, 0, 1.0 1216, 2086, 38, 1, 1.0 1248, 0, 0, 0, 1.175 1248, 0, 0, 1, 1.174 1248, 39, 0, 0, 1.074 1248, 39, 0, 1, 1.074 1248, 0, 39, 0, 0.975 1248, 0, 39, 1, 0.985 1248, 39, 39, 0, 1.064 1248, 39, 39, 1, 1.064 1248, 2048, 0, 0, 1.179 1248, 2048, 0, 1, 1.178 1248, 2087, 0, 0, 1.074 1248, 2087, 0, 1, 1.074 1248, 2048, 39, 0, 0.985 1248, 2048, 39, 1, 0.985 1248, 2087, 39, 0, 1.026 1248, 2087, 39, 1, 1.026 1280, 0, 0, 0, 0.992 1280, 0, 0, 1, 0.992 1280, 40, 0, 0, 1.051 1280, 40, 0, 1, 1.051 1280, 0, 40, 0, 1.044 1280, 0, 40, 1, 1.044 1280, 40, 40, 0, 1.252 1280, 40, 40, 1, 1.252 1280, 2048, 0, 0, 0.992 1280, 2048, 0, 1, 0.992 1280, 2088, 0, 0, 1.051 1280, 2088, 0, 1, 1.051 1280, 2048, 40, 0, 0.946 1280, 2048, 40, 1, 0.946 1280, 2088, 40, 0, 1.252 1280, 2088, 40, 1, 1.252 1312, 0, 0, 0, 0.969 1312, 0, 0, 1, 0.969 1312, 41, 0, 0, 0.988 1312, 41, 0, 1, 0.988 1312, 0, 41, 0, 0.837 1312, 0, 41, 1, 0.837 1312, 41, 41, 0, 1.025 1312, 41, 41, 1, 1.025 1312, 2048, 0, 0, 0.969 1312, 2048, 0, 1, 0.969 1312, 2089, 0, 0, 0.988 1312, 2089, 0, 1, 0.987 1312, 2048, 41, 0, 0.837 1312, 2048, 41, 1, 0.837 1312, 2089, 41, 0, 0.975 1312, 2089, 41, 1, 0.975 1344, 0, 0, 0, 0.987 1344, 0, 0, 1, 0.988 1344, 42, 0, 0, 1.031 1344, 42, 0, 1, 1.031 1344, 0, 42, 0, 1.033 1344, 
[per-configuration bench-memcpy rows for lengths 1344 through 5120 omitted
 here; the format is the same as the bench-memcpy-large table below:
 length, align1, align2, dst > src, New Time / Old Time.]

Across the omitted sizes the ratios cluster around 1.0.  The most
consistent improvements (roughly 0.72-0.81) are the misaligned cases such
as align1 = 2048 with a small nonzero align2; the most consistent
regressions (roughly 1.36-1.46) are lengths 1568-2016 when both buffers
share the same odd misalignment (49/49 at 1568 up through 63/63 at 2016),
with a milder version of the same pattern (around 1.10) from 3136 up.
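For reading these tables: each row is one configuration, and the last
column is the new implementation's time divided by the old SSSE3
implementation's time, so values below 1.0 favor the new code.  The
sketch below is a rough stand-in for what a single row measures; it is
not glibc's benchtests harness (benchtests/bench-memcpy.c), and the slab
size, iteration count, and the choice of which buffer gets align1 are
illustrative assumptions only:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>

/* Time LEN-byte copies with the buffers at byte offsets ALIGN1/ALIGN2
   and the destination above or below the source.  A real comparison
   would run this once per memcpy implementation and report the ratio
   of the two times.  */
static double
bench_one (size_t len, size_t align1, size_t align2, int dst_gt_src)
{
  enum { SLAB = 1 << 24, HALF = 1 << 23, ITERS = 100000 };
  char *slab = aligned_alloc (4096, SLAB);
  /* Assumption: align1 offsets the source and align2 the destination;
     offsets >= 2048 land in the second half of a 4 KiB page, which is
     what the 2048+k rows above exercise.  */
  const char *src = slab + (dst_gt_src ? 0 : HALF) + align1;
  char *dst = slab + (dst_gt_src ? HALF : 0) + align2;
  struct timespec t0, t1;

  memset (slab, 1, SLAB);	/* Fault all pages in first.  */
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < ITERS; i++)
    memcpy (dst, src, len);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  free (slab);
  return ((t1.tv_sec - t0.tv_sec) * 1e9
	  + (t1.tv_nsec - t0.tv_nsec)) / ITERS;
}

int
main (void)
{
  /* One configuration from the table above.  */
  printf ("%.1f ns\n", bench_one (1344, 2048, 42, 0));
  return 0;
}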
Results For: bench-memcpy-large
length, align1, align2, dst > src, New Time / Old Time
65543, 0, 0, 0, 0.977
65543, 0, 0, 1, 0.976
65551, 0, 3, 0, 1.01
65551, 0, 3, 1, 1.011
65567, 3, 0, 0, 1.02
65567, 3, 0, 1, 1.02
65599, 3, 5, 0, 1.056
65599, 3, 5, 1, 1.057
65536, 0, 127, 0, 1.043
65536, 0, 127, 1, 1.043
65536, 0, 255, 0, 1.07
65536, 0, 255, 1, 1.071
65536, 0, 256, 0, 0.978
65536, 0, 256, 1, 0.979
65536, 0, 4064, 0, 1.017
65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.979
131079, 0, 0, 1, 0.979
131087, 0, 3, 0, 1.016
131087, 0, 3, 1, 1.016
131103, 3, 0, 0, 1.022
131103, 3, 0, 1, 1.022
131135, 3, 5, 0, 1.063
131135, 3, 5, 1, 1.063
131072, 0, 127, 0, 1.048
131072, 0, 127, 1, 1.048
131072, 0, 255, 0, 1.074
131072, 0, 255, 1, 1.074
131072, 0, 256, 0, 0.982
131072, 0, 256, 1, 0.982
131072, 0, 4064, 0, 1.018
131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.984
262151, 0, 0, 1, 0.984
262159, 0, 3, 0, 1.024
262159, 0, 3, 1, 1.024
262175, 3, 0, 0, 1.03
262175, 3, 0, 1, 1.03
262207, 3, 5, 0, 1.068
262207, 3, 5, 1, 1.069
262144, 0, 127, 0, 1.056
262144, 0, 127, 1, 1.056
262144, 0, 255, 0, 1.078
262144, 0, 255, 1, 1.078
262144, 0, 256, 0, 0.986
262144, 0, 256, 1, 0.986
262144, 0, 4064, 0, 1.02
262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692
524295, 0, 0, 1, 0.692
524303, 0, 3, 0, 0.736
524303, 0, 3, 1, 0.736
524319, 3, 0, 0, 0.759
524319, 3, 0, 1, 0.759
524351, 3, 5, 0, 0.758
524351, 3, 5, 1, 0.759
524288, 0, 127, 0, 1.057
524288, 0, 127, 1, 1.057
524288, 0, 255, 0, 1.079
524288, 0, 255, 1, 1.079
524288, 0, 256, 0, 0.987
524288, 0, 256, 1, 0.987
524288, 0, 4064, 0, 1.02
524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948
1048583, 0, 0, 1, 0.949
1048591, 0, 3, 0, 0.734
1048591, 0, 3, 1, 0.735
1048607, 3, 0, 0, 0.758
1048607, 3, 0, 1, 0.757
1048639, 3, 5, 0, 0.757
1048639, 3, 5, 1, 0.757
1048576, 0, 127, 0, 0.761
1048576, 0, 127, 1, 0.763
1048576, 0, 255, 0, 0.751
1048576, 0, 255, 1, 0.751
1048576, 0, 256, 0, 0.93
1048576, 0, 256, 1, 0.93
1048576, 0, 4064, 0, 0.93
1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928
2097159, 0, 0, 1, 0.931
2097167, 0, 3, 0, 0.735
2097167, 0, 3, 1, 0.734
2097183, 3, 0, 0, 0.759
2097183, 3, 0, 1, 0.76
2097215, 3, 5, 0, 0.758
2097215, 3, 5, 1, 0.757
2097152, 0, 127, 0, 0.77
2097152, 0, 127, 1, 0.77
2097152, 0, 255, 0, 0.745
2097152, 0, 255, 1, 0.745
2097152, 0, 256, 0, 0.924
2097152, 0, 256, 1, 0.925
2097152, 0, 4064, 0, 0.926
2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.886
4194311, 0, 0, 1, 0.89
4194319, 0, 3, 0, 0.746
4194319, 0, 3, 1, 0.745
4194335, 3, 0, 0, 0.816
4194335, 3, 0, 1, 0.816
4194367, 3, 5, 0, 0.78
4194367, 3, 5, 1, 0.781
4194304, 0, 127, 0, 0.792
4194304, 0, 127, 1, 0.791
4194304, 0, 255, 0, 0.803
4194304, 0, 255, 1, 0.799
4194304, 0, 256, 0, 0.865
4194304, 0, 256, 1, 0.863
4194304, 0, 4064, 0, 0.953
4194304, 0, 4064, 1, 0.95
8388615, 0, 0, 0, 0.876
8388615, 0, 0, 1, 0.877
8388623, 0, 3, 0, 0.762
8388623, 0, 3, 1, 0.762
8388639, 3, 0, 0, 0.871
8388639, 3, 0, 1, 0.87
8388671, 3, 5, 0, 0.805
8388671, 3, 5, 1, 0.808
8388608, 0, 127, 0, 0.824
8388608, 0, 127, 1, 0.823
8388608, 0, 255, 0, 0.858
8388608, 0, 255, 1, 0.857
8388608, 0, 256, 0, 0.843
8388608, 0, 256, 1, 0.84
8388608, 0, 4064, 0, 0.981
8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.881
16777223, 0, 0, 1, 0.882
16777231, 0, 3, 0, 0.765
16777231, 0, 3, 1, 0.765
16777247, 3, 0, 0, 0.87
16777247, 3, 0, 1, 0.87
16777279, 3, 5, 0, 0.807
16777279, 3, 5, 1, 0.811
16777216, 0, 127, 0, 0.827
16777216, 0, 127, 1, 0.827
16777216, 0, 255, 0, 0.858
16777216, 0, 255, 1, 0.857
16777216, 0, 256, 0, 0.848
16777216, 0, 256, 1, 0.844
16777216, 0, 4064, 0, 0.98
16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.883
33554439, 0, 0, 1, 0.884
33554447, 0, 3, 0, 0.767
33554447, 0, 3, 1, 0.766
33554463, 3, 0, 0, 0.87
33554463, 3, 0, 1, 0.87
33554495, 3, 5, 0, 0.809
33554495, 3, 5, 1, 0.813
33554432, 0, 127, 0, 0.829
33554432, 0, 127, 1, 0.829
33554432, 0, 255, 0, 0.857
33554432, 0, 255, 1, 0.857
33554432, 0, 256, 0, 0.85
33554432, 0, 256, 1, 0.846
33554432, 0, 4064, 0, 0.981
33554432, 0, 4064, 1, 0.981
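The sharp drop at 524288 bytes and up (0.69-0.76 on the 0/0 and 3/5
cases) is the hand-off to the large-copy paths: the deleted file has
dedicated L(large_page_fwd)/L(large_page_bwd) code that it enters once
the size exceeds half the shared cache size (__x86_shared_cache_size_half
in the diff below).  The usual technique there, sketched here with SSE2
intrinsics under assumed alignment and a made-up threshold value rather
than the real tunable, is to switch from ordinary to non-temporal stores:

#include <stddef.h>
#include <emmintrin.h>

/* Stand-in for glibc's __x86_shared_cache_size_half; the value is an
   arbitrary assumption for illustration.  */
static size_t cache_half = 4 * 1024 * 1024;

/* Forward copy with both pointers 16-byte aligned and LEN a multiple
   of 16; the real entry points fix up the unaligned edges first.  */
static void
copy_fwd_aligned (char *dst, const char *src, size_t len)
{
  if (len <= cache_half)
    {
      /* Small enough to keep in cache: ordinary aligned stores.  */
      for (size_t i = 0; i < len; i += 16)
	_mm_store_si128 ((__m128i *) (dst + i),
			 _mm_load_si128 ((const __m128i *) (src + i)));
    }
  else
    {
      /* Too big to cache: non-temporal stores bypass the cache and
	 avoid evicting the working set with data we will not reread.  */
      for (size_t i = 0; i < len; i += 16)
	_mm_stream_si128 ((__m128i *) (dst + i),
			  _mm_load_si128 ((const __m128i *) (src + i)));
      _mm_sfence ();	/* Order the streaming stores.  */
    }
}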
Results For: bench-memcpy-random
length, New Time / Old Time
32768, 0.888
65536, 0.906
131072, 0.915
262144, 0.919
524288, 0.921
1048576, 0.929

 sysdeps/x86_64/multiarch/Makefile        |    1 -
 sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 3151 ----------------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S |  384 ++-
 3 files changed, 380 insertions(+), 3156 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
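Most of the 3151 deleted lines below are nearly identical copy loops,
one per relative misalignment of source and destination (L(shl_1),
L(shl_2), ... plus backward L(shl_N_bwd) variants), because SSSE3's
palignr instruction only takes an immediate shift count; the
BRANCH_TO_JMPTBL_ENTRY macro near the top of the deleted file dispatches
to the right loop.  A sketch of the trick for a single misalignment (a
shift of 1, what L(shl_1) handles), assuming -mssse3, a 16-byte-aligned
destination, a length that is a multiple of 16, and that reading back to
the previous 16-byte boundary is safe (the assembly makes the same
assumption with its movaps -0x01(%rsi) load):

#include <stddef.h>
#include <tmmintrin.h>

static void
copy_shl1 (char *dst, const char *src, size_t len)
{
  /* SRC is assumed to be 1 byte past a 16-byte boundary, so SRC - 1
     is aligned and can be read with aligned loads.  */
  const __m128i *s = (const __m128i *) (src - 1);
  __m128i prev = _mm_load_si128 (s++);

  for (size_t i = 0; i < len; i += 16)
    {
      __m128i next = _mm_load_si128 (s++);
      /* Concatenate PREV:NEXT and shift right one byte, producing the
	 16 unaligned source bytes without an unaligned load.  */
      _mm_store_si128 ((__m128i *) (dst + i),
		       _mm_alignr_epi8 (next, prev, 1));
      prev = next;
    }
}

This is why the file was worth 3151 lines on pre-Nehalem cores, where
unaligned 16-byte loads were expensive, and why it is removable now that
movdqu-based SSE2/AVX2/EVEX versions are fast on everything that matters.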
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
   memcmpeq-sse2 \
-  memcpy-ssse3 \
   memmove-avx-unaligned-erms \
   memmove-avx-unaligned-erms-rtm \
   memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B)	I - B
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   relative offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX.  */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)	\
-  lea	TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), INDEX;		\
-  lea	(%r11, INDEX), INDEX;				\
-  _CET_NOTRACK jmp *INDEX;				\
-  ud2
-
-	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	mov	%RDI_LP, %RAX_LP
-	add	%RDX_LP, %RAX_LP
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
-	add	%RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
-	/* Clear the upper 32 bits.
*/ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(write_0bytes) - cmp $79, %rdx - jbe L(copy_forward) - jmp L(copy_backward) -L(copy_forward): -#endif -L(start): - cmp $79, %rdx - lea L(table_less_80bytes)(%rip), %r11 - ja L(80bytesormore) - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(80bytesormore): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_fwd) - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - - .p2align 4 -L(copy_backward): - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_bwd) - and $0xf, %r9 - jz L(shl_0_bwd) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - - .p2align 4 -L(shl_0): - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble) - cmp $64, %rdx - jb L(shl_0_less_64bytes) - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -L(shl_0_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_cache_less_64bytes) - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_cache_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 
0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_less_64bytes) - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_mem_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_less_32bytes) - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -L(shl_0_mem_less_32bytes): - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble_bwd) - cmp $64, %rdx - jb L(shl_0_less_64bytes_bwd) - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -L(shl_0_less_64bytes_bwd): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_bwd): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_bwd_loop) -L(shl_0_gobble_bwd_loop): - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_gobble_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_gobble_bwd_less_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_bwd_loop): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa 
%xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_mem_bwd_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_bwd_less_32bytes) - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -L(shl_0_mem_bwd_less_32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1): - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_fwd) - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 -L(L1_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_1_loop_L1): - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_1_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_bwd) - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 -L(L1_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_1_bwd_loop_L1): - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 - movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_1_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2): - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_fwd) - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 -L(L2_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_2_loop_L1): - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) 
- jb L(shl_2_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_bwd) - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 -L(L2_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_2_bwd_loop_L1): - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_2_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3): - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_fwd) - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 -L(L3_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_3_loop_L1): - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_3_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_bwd) - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 -L(L3_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_3_bwd_loop_L1): - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_3_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4): - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_fwd) - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 -L(L4_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_4_loop_L1): - sub $64, %rdx - movaps 0x0c(%rsi), 
%xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_4_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_bwd) - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 -L(L4_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_4_bwd_loop_L1): - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_4_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5): - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_fwd) - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 -L(L5_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_5_loop_L1): - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_5_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_bwd) - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 -L(L5_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_5_bwd_loop_L1): - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_5_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - 
.p2align 4 -L(shl_6): - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_fwd) - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 -L(L6_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_6_loop_L1): - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 -L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 -L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 -L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, 
-0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 -L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 -L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 -L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 -L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 -L(L9_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_9_loop_L1): - sub $64, %rdx - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_9_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_bwd) - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 -L(L9_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 
-L(shl_9_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_9_bwd_loop_L1): - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_9_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10): - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_fwd) - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 -L(L10_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_10_loop_L1): - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_10_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_bwd) - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 -L(L10_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_10_bwd_loop_L1): - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_10_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11): - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_fwd) - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 -L(L11_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_11_loop_L1): - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_11_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, 
-0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_bwd) - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 -L(L11_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_11_bwd_loop_L1): - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_11_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12): - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_fwd) - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 -L(L12_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_12_loop_L1): - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_12_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_bwd) - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 -L(L12_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_12_bwd_loop_L1): - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_12_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13): - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_fwd) - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 -L(L13_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_13_loop_L1): - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, 
%xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_13_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_bwd) - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 -L(L13_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_13_bwd_loop_L1): - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_13_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14): - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_fwd) - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 -L(L14_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_14_loop_L1): - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_14_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_bwd) - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 -L(L14_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_14_bwd_loop_L1): - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_14_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15): - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 - cmp 
%rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_fwd) - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 -L(L15_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_15_loop_L1): - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_15_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_bwd) - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 -L(L15_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_15_bwd_loop_L1): - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_15_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(write_72bytes): - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_64bytes): - movdqu -64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_56bytes): - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_48bytes): - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_40bytes): - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_32bytes): - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov 
-16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_24bytes): - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_16bytes): - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -L(write_0bytes): - ret - - .p2align 4 -L(write_73bytes): - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_65bytes): - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_57bytes): - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_49bytes): - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_41bytes): - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_33bytes): - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_25bytes): - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_17bytes): - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_9bytes): - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_1bytes): - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_74bytes): - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_66bytes): - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - 
mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_58bytes): - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_50bytes): - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_42bytes): - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_34bytes): - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_26bytes): - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_18bytes): - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_10bytes): - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_2bytes): - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(write_75bytes): - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_67bytes): - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_59bytes): - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_51bytes): - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - 
mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_43bytes): - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_35bytes): - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_27bytes): - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_19bytes): - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_11bytes): - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(write_76bytes): - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_68bytes): - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_60bytes): - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_52bytes): - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_44bytes): - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_36bytes): - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_28bytes): - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_20bytes): - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, 
-4(%rdi) - ret - - .p2align 4 -L(write_12bytes): - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_77bytes): - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_69bytes): - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_61bytes): - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_53bytes): - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_45bytes): - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_37bytes): - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_29bytes): - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_21bytes): - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_13bytes): - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_78bytes): - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_70bytes): - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, 
-22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_62bytes): - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - 
mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_23bytes): - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif -L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#endif - .p2align 4 -L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - 
lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif -L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) -#endif - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_less_80bytes): - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL 
(L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - - .p2align 3 -L(shl_table): - .int 
JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 3
-L(shl_table_bwd):
-	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..215583e7bd 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE	__memmove_ssse3
+# define MEMMOVE_CHK	__memmove_chk_ssse3
+# define MEMCPY	__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+	.section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+	jmp	L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+	movq	%rdi, %rax
+L(start):
+	cmpq	$16, %rdx
+	jb	L(copy_0_15)
+
+	/* These loads are always useful.
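+	   Sizes in [16, 32] are copied entirely by these two (possibly
+	   overlapping) vectors, and every larger path reuses them as the
+	   unaligned head and tail stores, so neither load is wasted.  */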
*/ + movups 0(%rsi), %xmm0 + movups -16(%rsi, %rdx), %xmm7 + cmpq $32, %rdx + ja L(more_2x_vec) + + movups %xmm0, 0(%rdi) + movups %xmm7, -16(%rdi, %rdx) + ret + + .p2align 4,, 4 +L(copy_0_15): + cmpl $4, %edx + jb L(copy_0_3) + cmpl $8, %edx + jb L(copy_4_7) + movq 0(%rsi), %rcx + movq -8(%rsi, %rdx), %rsi + movq %rcx, 0(%rdi) + movq %rsi, -8(%rdi, %rdx) + ret + + .p2align 4,, 4 +L(copy_4_7): + movl 0(%rsi), %ecx + movl -4(%rsi, %rdx), %esi + movl %ecx, 0(%rdi) + movl %esi, -4(%rdi, %rdx) + ret + + .p2align 4,, 4 +L(copy_0_3): + decl %edx + jl L(copy_0_0) + movb (%rsi), %cl + je L(copy_1_1) + + movzwl -1(%rsi, %rdx), %esi + movw %si, -1(%rdi, %rdx) +L(copy_1_1): + movb %cl, (%rdi) +L(copy_0_0): + ret + + .p2align 4,, 4 +L(copy_4x_vec): + movups 16(%rsi), %xmm1 + movups -32(%rsi, %rdx), %xmm2 + + movups %xmm0, 0(%rdi) + movups %xmm1, 16(%rdi) + movups %xmm2, -32(%rdi, %rdx) + movups %xmm7, -16(%rdi, %rdx) +L(nop): + ret + + .p2align 4 +L(more_2x_vec): + cmpq $64, %rdx + jbe L(copy_4x_vec) + + /* We use rcx later to get alignr value. */ + movq %rdi, %rcx + + /* Backward copy for overlap + dst > src for memmove safety. */ + subq %rsi, %rcx + cmpq %rdx, %rcx + jb L(copy_backward) + + /* Load tail. */ + + /* -16(%rsi, %rdx) already loaded into xmm7. */ + movups -32(%rsi, %rdx), %xmm8 + movups -48(%rsi, %rdx), %xmm9 + + /* Get misalignment. */ + andl $0xf, %ecx + + movq %rsi, %r9 + addq %rcx, %rsi + andq $-16, %rsi + /* Get first vec for `palignr`. */ + movaps (%rsi), %xmm1 + + /* We have loaded (%rsi) so safe to do this store before the + loop. */ + movups %xmm0, (%rdi) + +#ifdef SHARED_CACHE_SIZE_HALF + cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP +#else + cmp __x86_shared_cache_size_half(%rip), %rdx +#endif + ja L(large_memcpy) + + leaq -64(%rdi, %rdx), %r8 + andq $-16, %rdi + movl $48, %edx + + leaq L(loop_fwd_start)(%rip), %r9 + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx + + .p2align 4,, 8 +L(copy_backward): + testq %rcx, %rcx + jz L(nop) + + /* Preload tail. */ + + /* (%rsi) already loaded into xmm0. */ + movups 16(%rsi), %xmm4 + movups 32(%rsi), %xmm5 + + movq %rdi, %r8 + subq %rdi, %rsi + leaq -49(%rdi, %rdx), %rdi + andq $-16, %rdi + addq %rdi, %rsi + andq $-16, %rsi + + movaps 48(%rsi), %xmm6 + + + leaq L(loop_bkwd_start)(%rip), %r9 + andl $0xf, %ecx + sall $6, %ecx + addq %r9, %rcx + jmp * %rcx + + .p2align 4,, 8 +L(large_memcpy): + movups -64(%r9, %rdx), %xmm10 + movups -80(%r9, %rdx), %xmm11 + + sall $5, %ecx + leal (%rcx, %rcx, 2), %r8d + leaq -96(%rdi, %rdx), %rcx + andq $-16, %rdi + leaq L(large_loop_fwd_start)(%rip), %rdx + addq %r8, %rdx + jmp * %rdx + + + /* Instead of a typical jump table all 16 loops are exactly + 64 bytes in size. So, we can just jump to first loop + r8 * + 64. Before modifying any loop ensure all their sizes match! + */ + .p2align 6 +L(loop_fwd_start): +L(loop_fwd_0x0): + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps %xmm1, 16(%rdi) + movaps %xmm2, 32(%rdi) + movaps %xmm3, 48(%rdi) + addq %rdx, %rdi + addq %rdx, %rsi + cmpq %rdi, %r8 + ja L(loop_fwd_0x0) +L(end_loop_fwd): + movups %xmm9, 16(%r8) + movups %xmm8, 32(%r8) + movups %xmm7, 48(%r8) + ret + + /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding. + 60 bytes otherwise.
*/ +#define ALIGNED_LOOP_FWD(align_by); \ + .p2align 6; \ +L(loop_fwd_ ## align_by): \ + movaps 16(%rsi), %xmm0; \ + movaps 32(%rsi), %xmm2; \ + movaps 48(%rsi), %xmm3; \ + movaps %xmm3, %xmm4; \ + palignr $align_by, %xmm2, %xmm3; \ + palignr $align_by, %xmm0, %xmm2; \ + palignr $align_by, %xmm1, %xmm0; \ + movaps %xmm4, %xmm1; \ + movaps %xmm0, 16(%rdi); \ + movaps %xmm2, 32(%rdi); \ + movaps %xmm3, 48(%rdi); \ + addq %rdx, %rdi; \ + addq %rdx, %rsi; \ + cmpq %rdi, %r8; \ + ja L(loop_fwd_ ## align_by); \ + jmp L(end_loop_fwd); + + /* Must be in descending order. */ + ALIGNED_LOOP_FWD (0xf) + ALIGNED_LOOP_FWD (0xe) + ALIGNED_LOOP_FWD (0xd) + ALIGNED_LOOP_FWD (0xc) + ALIGNED_LOOP_FWD (0xb) + ALIGNED_LOOP_FWD (0xa) + ALIGNED_LOOP_FWD (0x9) + ALIGNED_LOOP_FWD (0x8) + ALIGNED_LOOP_FWD (0x7) + ALIGNED_LOOP_FWD (0x6) + ALIGNED_LOOP_FWD (0x5) + ALIGNED_LOOP_FWD (0x4) + ALIGNED_LOOP_FWD (0x3) + ALIGNED_LOOP_FWD (0x2) + ALIGNED_LOOP_FWD (0x1) + + .p2align 6 +L(large_loop_fwd_start): +L(large_loop_fwd_0x0): + movaps 16(%rsi), %xmm1 + movaps 32(%rsi), %xmm2 + movaps 48(%rsi), %xmm3 + movaps 64(%rsi), %xmm4 + movaps 80(%rsi), %xmm5 + movntps %xmm1, 16(%rdi) + movntps %xmm2, 32(%rdi) + movntps %xmm3, 48(%rdi) + movntps %xmm4, 64(%rdi) + movntps %xmm5, 80(%rdi) + addq $80, %rdi + addq $80, %rsi + cmpq %rdi, %rcx + ja L(large_loop_fwd_0x0) + + /* Ensure no icache line split on tail. */ + .p2align 4 +L(end_large_loop_fwd): + sfence + movups %xmm11, 16(%rcx) + movups %xmm10, 32(%rcx) + movups %xmm9, 48(%rcx) + movups %xmm8, 64(%rcx) + movups %xmm7, 80(%rcx) + ret + + + /* Loops are > 64 bytes and <= 96 bytes in size. 32-byte alignment + ensures 96-byte spacing between each. */ +#define ALIGNED_LARGE_LOOP_FWD(align_by); \ + .p2align 5; \ +L(large_loop_fwd_ ## align_by): \ + movaps 16(%rsi), %xmm0; \ + movaps 32(%rsi), %xmm2; \ + movaps 48(%rsi), %xmm3; \ + movaps 64(%rsi), %xmm4; \ + movaps 80(%rsi), %xmm5; \ + movaps %xmm5, %xmm6; \ + palignr $align_by, %xmm4, %xmm5; \ + palignr $align_by, %xmm3, %xmm4; \ + palignr $align_by, %xmm2, %xmm3; \ + palignr $align_by, %xmm0, %xmm2; \ + palignr $align_by, %xmm1, %xmm0; \ + movaps %xmm6, %xmm1; \ + movntps %xmm0, 16(%rdi); \ + movntps %xmm2, 32(%rdi); \ + movntps %xmm3, 48(%rdi); \ + movntps %xmm4, 64(%rdi); \ + movntps %xmm5, 80(%rdi); \ + addq $80, %rdi; \ + addq $80, %rsi; \ + cmpq %rdi, %rcx; \ + ja L(large_loop_fwd_ ## align_by); \ + jmp L(end_large_loop_fwd); + + /* Must be in descending order. */ + ALIGNED_LARGE_LOOP_FWD (0xf) + ALIGNED_LARGE_LOOP_FWD (0xe) + ALIGNED_LARGE_LOOP_FWD (0xd) + ALIGNED_LARGE_LOOP_FWD (0xc) + ALIGNED_LARGE_LOOP_FWD (0xb) + ALIGNED_LARGE_LOOP_FWD (0xa) + ALIGNED_LARGE_LOOP_FWD (0x9) + ALIGNED_LARGE_LOOP_FWD (0x8) + ALIGNED_LARGE_LOOP_FWD (0x7) + ALIGNED_LARGE_LOOP_FWD (0x6) + ALIGNED_LARGE_LOOP_FWD (0x5) + ALIGNED_LARGE_LOOP_FWD (0x4) + ALIGNED_LARGE_LOOP_FWD (0x3) + ALIGNED_LARGE_LOOP_FWD (0x2) + ALIGNED_LARGE_LOOP_FWD (0x1) + + + .p2align 6 +L(loop_bkwd_start): +L(loop_bkwd_0x0): + movaps 32(%rsi), %xmm1 + movaps 16(%rsi), %xmm2 + movaps 0(%rsi), %xmm3 + movaps %xmm1, 32(%rdi) + movaps %xmm2, 16(%rdi) + movaps %xmm3, 0(%rdi) + subq $48, %rdi + subq $48, %rsi + cmpq %rdi, %r8 + jb L(loop_bkwd_0x0) +L(end_loop_bkwd): + movups %xmm7, -16(%r8, %rdx) + movups %xmm0, 0(%r8) + movups %xmm4, 16(%r8) + movups %xmm5, 32(%r8) + + ret + + + /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding. + 60 bytes otherwise.
*/ +#define ALIGNED_LOOP_BKWD(align_by); \ + .p2align 6; \ +L(loop_bkwd_ ## align_by): \ + movaps 32(%rsi), %xmm1; \ + movaps 16(%rsi), %xmm2; \ + movaps 0(%rsi), %xmm3; \ + palignr $align_by, %xmm1, %xmm6; \ + palignr $align_by, %xmm2, %xmm1; \ + palignr $align_by, %xmm3, %xmm2; \ + movaps %xmm6, 32(%rdi); \ + movaps %xmm1, 16(%rdi); \ + movaps %xmm2, 0(%rdi); \ + subq $48, %rdi; \ + subq $48, %rsi; \ + movaps %xmm3, %xmm6; \ + cmpq %rdi, %r8; \ + jb L(loop_bkwd_ ## align_by); \ + jmp L(end_loop_bkwd); + + /* Must be in descending order. */ + ALIGNED_LOOP_BKWD (0xf) + ALIGNED_LOOP_BKWD (0xe) + ALIGNED_LOOP_BKWD (0xd) + ALIGNED_LOOP_BKWD (0xc) + ALIGNED_LOOP_BKWD (0xb) + ALIGNED_LOOP_BKWD (0xa) + ALIGNED_LOOP_BKWD (0x9) + ALIGNED_LOOP_BKWD (0x8) + ALIGNED_LOOP_BKWD (0x7) + ALIGNED_LOOP_BKWD (0x6) + ALIGNED_LOOP_BKWD (0x5) + ALIGNED_LOOP_BKWD (0x4) + ALIGNED_LOOP_BKWD (0x3) + ALIGNED_LOOP_BKWD (0x2) + ALIGNED_LOOP_BKWD (0x1) +END(MEMMOVE) + +strong_alias (MEMMOVE, MEMCPY) +strong_alias (MEMMOVE_CHK, MEMCPY_CHK) -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
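For reference, the forward/backward dispatch in L(more_2x_vec) above reduces to a single unsigned compare: `subq %rsi, %rcx; cmpq %rdx, %rcx; jb L(copy_backward)` takes the backward path exactly when (dst - src) < len as an unsigned value. A minimal C model of that check (illustrative only, not glibc source; `model_memmove` is a made-up name):

    #include <stddef.h>
    #include <stdint.h>

    static void *
    model_memmove (void *dst, const void *src, size_t len)
    {
      unsigned char *d = dst;
      const unsigned char *s = src;

      /* dst - src, taken as an unsigned value, is >= len exactly when
         an ascending copy never overwrites a byte it still has to
         read.  dst below src wraps around to a huge value and so also
         takes the forward path, which is safe in that direction.  */
      if ((uintptr_t) dst - (uintptr_t) src >= len)
        for (size_t i = 0; i < len; i++)	/* like L(loop_fwd_*) */
          d[i] = s[i];
      else
        while (len--)				/* like L(loop_bkwd_*) */
          d[len] = s[len];
      return dst;
    }

Because the overlap test is this cheap (one sub/cmp on a path that already has both pointers in registers), the file can simply make memcpy and mempcpy strong aliases of memmove, as the tail of the patch does.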
* Re: [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (4 preceding siblings ...) 2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein @ 2022-04-14 18:04 ` H.J. Lu 5 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-04-14 18:04 UTC (permalink / raw) To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it is no longer worth it to keep the SSSE3 > versions given the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - > sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - > sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- > sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - > 5 files changed, 2006 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 6507d1b7fa..51222dfab1 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -12,7 +12,6 @@ sysdep_routines += \ > memcmp-evex-movbe \ > memcmp-sse2 \ > memcmp-sse4 \ > - memcmp-ssse3 \ > memcmpeq-avx2 \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > @@ -179,7 +178,6 @@ sysdep_routines += \ > wmemcmp-c \ > wmemcmp-evex-movbe \ > wmemcmp-sse4 \ > - wmemcmp-ssse3 \ > # sysdep_routines > endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 40cc6cc49e..f389928a4e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), > __memcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), > - __memcmp_ssse3) > IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) > > #ifdef SHARED > @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __wmemcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), > __wmemcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), > - __wmemcmp_ssse3) > IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ > diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > index cd12613699..44759a3ad5 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > @@ -20,7 +20,6 @@ > # include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; > @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > return OPTIMIZE (sse4_1); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S > deleted file mode 100644 > index df1b1fc494..0000000000 > --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S > +++ /dev/null > @@ -1,1992 +0,0 @@ > -/* memcmp with SSSE3, wmemcmp with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > - > -# ifndef MEMCMP > -# define MEMCMP __memcmp_ssse3 > -# endif > - > -/* Warning! > - wmemcmp has to use SIGNED comparison for elements. > - memcmp has to use UNSIGNED comparison for elemnts. > -*/ > - > - atom_text_section > -ENTRY (MEMCMP) > -# ifdef USE_AS_WMEMCMP > - shl $2, %RDX_LP > - test %RDX_LP, %RDX_LP > - jz L(equal) > -# elif defined __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -# endif > - mov %rdx, %rcx > - mov %rdi, %rdx > - cmp $48, %rcx; > - jae L(48bytesormore) /* LEN => 48 */ > - > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -/* ECX >= 32. 
*/ > -L(48bytesormore): > - movdqu (%rdi), %xmm3 > - movdqu (%rsi), %xmm0 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %edx > - lea 16(%rdi), %rdi > - lea 16(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(less16bytes) > - mov %edi, %edx > - and $0xf, %edx > - xor %rdx, %rdi > - sub %rdx, %rsi > - add %rdx, %rcx > - mov %esi, %edx > - and $0xf, %edx > - jz L(shr_0) > - xor %rdx, %rsi > - > -# ifndef USE_AS_WMEMCMP > - cmp $8, %edx > - jae L(next_unaligned_table) > - cmp $0, %edx > - je L(shr_0) > - cmp $1, %edx > - je L(shr_1) > - cmp $2, %edx > - je L(shr_2) > - cmp $3, %edx > - je L(shr_3) > - cmp $4, %edx > - je L(shr_4) > - cmp $5, %edx > - je L(shr_5) > - cmp $6, %edx > - je L(shr_6) > - jmp L(shr_7) > - > - .p2align 2 > -L(next_unaligned_table): > - cmp $8, %edx > - je L(shr_8) > - cmp $9, %edx > - je L(shr_9) > - cmp $10, %edx > - je L(shr_10) > - cmp $11, %edx > - je L(shr_11) > - cmp $12, %edx > - je L(shr_12) > - cmp $13, %edx > - je L(shr_13) > - cmp $14, %edx > - je L(shr_14) > - jmp L(shr_15) > -# else > - cmp $0, %edx > - je L(shr_0) > - cmp $4, %edx > - je L(shr_4) > - cmp $8, %edx > - je L(shr_8) > - jmp L(shr_12) > -# endif > - > - .p2align 4 > -L(shr_0): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - jae L(shr_0_gobble) > - xor %eax, %eax > - movdqa (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > - pand %xmm1, %xmm2 > - pmovmskb %xmm2, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_0_gobble): > - movdqa (%rsi), %xmm0 > - xor %eax, %eax > - pcmpeqb (%rdi), %xmm0 > - sub $32, %rcx > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > -L(shr_0_gobble_loop): > - pand %xmm0, %xmm2 > - sub $32, %rcx > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - movdqa 32(%rsi), %xmm0 > - movdqa 48(%rsi), %xmm2 > - sbb $0xffff, %edx > - pcmpeqb 32(%rdi), %xmm0 > - pcmpeqb 48(%rdi), %xmm2 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - jz L(shr_0_gobble_loop) > - > - pand %xmm0, %xmm2 > - cmp $0, %rcx > - jge L(next) > - inc %edx > - add $32, %rcx > -L(next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_1): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_1_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $1, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $1, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_1_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $1, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_1_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $1, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $1, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 
32(%rdi), %rdi > - jz L(shr_1_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_1_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_1_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 1(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - > - .p2align 4 > -L(shr_2): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_2_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $2, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $2, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_2_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $2, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_2_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $2, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $2, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_2_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_2_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_2_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 2(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_3_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $3, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $3, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $3, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_3_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $3, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $3, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_3_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_3_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_3_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 3(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 
4 > -L(shr_4): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_4_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $4, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $4, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_4_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $4, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_4_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $4, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $4, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_4_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_4_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_4_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 4(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_5): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_5_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $5, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $5, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_5_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $5, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_5_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $5, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $5, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_5_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_5_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_5_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 5(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_6_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $6, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > 
- add $6, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $6, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_6_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $6, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $6, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_6_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_6_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_6_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 6(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_7_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $7, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $7, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $7, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_7_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $7, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $7, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_7_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_7_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_7_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 7(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_8): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_8_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $8, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $8, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_8_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $8, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_8_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), 
%xmm3 > - palignr $8, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $8, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_8_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_8_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_8_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 8(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_9): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_9_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $9, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $9, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_9_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $9, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_9_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $9, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $9, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_9_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_9_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_9_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 9(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_10_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $10, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $10, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $10, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_10_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $10, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $10, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_10_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_10_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_10_gobble_next): > - test %edx, %edx > - jnz L(exit) > - 
> - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 10(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_11_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $11, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $11, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $11, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_11_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $11, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $11, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_11_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_11_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_11_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 11(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_12): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_12_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $12, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $12, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_12_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $12, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_12_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $12, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $12, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_12_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_12_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_12_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 12(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_13): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_13_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, 
%xmm2 > - palignr $13, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $13, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_13_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $13, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_13_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $13, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $13, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_13_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_13_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_13_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 13(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_14_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $14, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $14, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $14, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_14_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $14, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $14, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_14_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_14_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_14_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 14(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_15_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $15, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $15, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15_gobble): > - sub $32, %rcx > - movdqa 
16(%rsi), %xmm0 > - palignr $15, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_15_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $15, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $15, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_15_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_15_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_15_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 15(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > -# endif > - .p2align 4 > -L(exit): > - pmovmskb %xmm1, %r8d > - sub $0xffff, %r8d > - jz L(first16bytes) > - lea -16(%rsi), %rsi > - lea -16(%rdi), %rdi > - mov %r8d, %edx > -L(first16bytes): > - add %rax, %rsi > -L(less16bytes): > -# ifndef USE_AS_WMEMCMP > - test %dl, %dl > - jz L(next_24_bytes) > - > - test $0x01, %dl > - jnz L(Byte16) > - > - test $0x02, %dl > - jnz L(Byte17) > - > - test $0x04, %dl > - jnz L(Byte18) > - > - test $0x08, %dl > - jnz L(Byte19) > - > - test $0x10, %dl > - jnz L(Byte20) > - > - test $0x20, %dl > - jnz L(Byte21) > - > - test $0x40, %dl > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte16): > - movzbl -16(%rdi), %eax > - movzbl -16(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte17): > - movzbl -15(%rdi), %eax > - movzbl -15(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte18): > - movzbl -14(%rdi), %eax > - movzbl -14(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte19): > - movzbl -13(%rdi), %eax > - movzbl -13(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte20): > - movzbl -12(%rdi), %eax > - movzbl -12(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte21): > - movzbl -11(%rdi), %eax > - movzbl -11(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte22): > - movzbl -10(%rdi), %eax > - movzbl -10(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(next_24_bytes): > - lea 8(%rdi), %rdi > - lea 8(%rsi), %rsi > - test $0x01, %dh > - jnz L(Byte16) > - > - test $0x02, %dh > - jnz L(Byte17) > - > - test $0x04, %dh > - jnz L(Byte18) > - > - test $0x08, %dh > - jnz L(Byte19) > - > - test $0x10, %dh > - jnz L(Byte20) > - > - test $0x20, %dh > - jnz L(Byte21) > - > - test $0x40, %dh > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > -# else > -/* special for wmemcmp */ > - xor %eax, %eax > - test %dl, %dl > - jz L(next_two_double_words) > - and $15, %dl > - jz L(second_double_word) > - mov -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(second_double_word): > - mov -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(next_two_double_words): > - and $15, %dh > - jz L(fourth_double_word) > - mov -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(fourth_double_word): > - mov -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > - ret > -# endif > - > - 
.p2align 4 > -L(less48bytes): > - cmp $8, %ecx > - jae L(more8bytes) > - cmp $0, %ecx > - je L(0bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $1, %ecx > - je L(1bytes) > - cmp $2, %ecx > - je L(2bytes) > - cmp $3, %ecx > - je L(3bytes) > - cmp $4, %ecx > - je L(4bytes) > - cmp $5, %ecx > - je L(5bytes) > - cmp $6, %ecx > - je L(6bytes) > - jmp L(7bytes) > -# else > - jmp L(4bytes) > -# endif > - > - .p2align 4 > -L(more8bytes): > - cmp $16, %ecx > - jae L(more16bytes) > - cmp $8, %ecx > - je L(8bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $9, %ecx > - je L(9bytes) > - cmp $10, %ecx > - je L(10bytes) > - cmp $11, %ecx > - je L(11bytes) > - cmp $12, %ecx > - je L(12bytes) > - cmp $13, %ecx > - je L(13bytes) > - cmp $14, %ecx > - je L(14bytes) > - jmp L(15bytes) > -# else > - jmp L(12bytes) > -# endif > - > - .p2align 4 > -L(more16bytes): > - cmp $24, %ecx > - jae L(more24bytes) > - cmp $16, %ecx > - je L(16bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $17, %ecx > - je L(17bytes) > - cmp $18, %ecx > - je L(18bytes) > - cmp $19, %ecx > - je L(19bytes) > - cmp $20, %ecx > - je L(20bytes) > - cmp $21, %ecx > - je L(21bytes) > - cmp $22, %ecx > - je L(22bytes) > - jmp L(23bytes) > -# else > - jmp L(20bytes) > -# endif > - > - .p2align 4 > -L(more24bytes): > - cmp $32, %ecx > - jae L(more32bytes) > - cmp $24, %ecx > - je L(24bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $25, %ecx > - je L(25bytes) > - cmp $26, %ecx > - je L(26bytes) > - cmp $27, %ecx > - je L(27bytes) > - cmp $28, %ecx > - je L(28bytes) > - cmp $29, %ecx > - je L(29bytes) > - cmp $30, %ecx > - je L(30bytes) > - jmp L(31bytes) > -# else > - jmp L(28bytes) > -# endif > - > - .p2align 4 > -L(more32bytes): > - cmp $40, %ecx > - jae L(more40bytes) > - cmp $32, %ecx > - je L(32bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $33, %ecx > - je L(33bytes) > - cmp $34, %ecx > - je L(34bytes) > - cmp $35, %ecx > - je L(35bytes) > - cmp $36, %ecx > - je L(36bytes) > - cmp $37, %ecx > - je L(37bytes) > - cmp $38, %ecx > - je L(38bytes) > - jmp L(39bytes) > -# else > - jmp L(36bytes) > -# endif > - > - .p2align 4 > -L(more40bytes): > - cmp $40, %ecx > - je L(40bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $41, %ecx > - je L(41bytes) > - cmp $42, %ecx > - je L(42bytes) > - cmp $43, %ecx > - je L(43bytes) > - cmp $44, %ecx > - je L(44bytes) > - cmp $45, %ecx > - je L(45bytes) > - cmp $46, %ecx > - je L(46bytes) > - jmp L(47bytes) > - > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - movl -44(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - movl -40(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - movl -36(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - movl -32(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - movl -28(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - movl -24(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - movl -20(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - movl -16(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - movl -12(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - movl -8(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - movl -4(%rsi), %ecx > - cmp %ecx, %eax > - jne 
L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# else > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - cmp -44(%rsi), %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - cmp -40(%rsi), %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - cmp -36(%rsi), %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - cmp -32(%rsi), %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - cmp -28(%rsi), %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - cmp -24(%rsi), %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - cmp -20(%rsi), %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# endif > - > -# ifndef USE_AS_WMEMCMP > - .p2align 4 > -L(45bytes): > - movl -45(%rdi), %eax > - movl -45(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(41bytes): > - movl -41(%rdi), %eax > - movl -41(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(37bytes): > - movl -37(%rdi), %eax > - movl -37(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(33bytes): > - movl -33(%rdi), %eax > - movl -33(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(29bytes): > - movl -29(%rdi), %eax > - movl -29(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(25bytes): > - movl -25(%rdi), %eax > - movl -25(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(21bytes): > - movl -21(%rdi), %eax > - movl -21(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(17bytes): > - movl -17(%rdi), %eax > - movl -17(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(13bytes): > - movl -13(%rdi), %eax > - movl -13(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(9bytes): > - movl -9(%rdi), %eax > - movl -9(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(5bytes): > - movl -5(%rdi), %eax > - movl -5(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(1bytes): > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(46bytes): > - movl -46(%rdi), %eax > - movl -46(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(42bytes): > - movl -42(%rdi), %eax > - movl -42(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(38bytes): > - movl -38(%rdi), %eax > - movl -38(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(34bytes): > - movl -34(%rdi), %eax > - movl -34(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(30bytes): > - movl -30(%rdi), %eax > - movl -30(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(26bytes): > - movl -26(%rdi), %eax > - movl -26(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(22bytes): > - movl -22(%rdi), %eax > - movl -22(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(18bytes): > - movl -18(%rdi), %eax > - movl -18(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(14bytes): > - movl -14(%rdi), %eax > - movl -14(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(10bytes): > - movl -10(%rdi), %eax > - movl -10(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(6bytes): > - movl -6(%rdi), %eax > - movl -6(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(2bytes): > - 
movzwl -2(%rdi), %eax > - movzwl -2(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(47bytes): > - movl -47(%rdi), %eax > - movl -47(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(43bytes): > - movl -43(%rdi), %eax > - movl -43(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(39bytes): > - movl -39(%rdi), %eax > - movl -39(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(35bytes): > - movl -35(%rdi), %eax > - movl -35(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(31bytes): > - movl -31(%rdi), %eax > - movl -31(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(27bytes): > - movl -27(%rdi), %eax > - movl -27(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(23bytes): > - movl -23(%rdi), %eax > - movl -23(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(19bytes): > - movl -19(%rdi), %eax > - movl -19(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(15bytes): > - movl -15(%rdi), %eax > - movl -15(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(11bytes): > - movl -11(%rdi), %eax > - movl -11(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(7bytes): > - movl -7(%rdi), %eax > - movl -7(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(3bytes): > - movzwl -3(%rdi), %eax > - movzwl -3(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(find_diff): > - cmpb %cl, %al > - jne L(set) > - cmpw %cx, %ax > - jne L(set) > - shr $16, %eax > - shr $16, %ecx > - cmpb %cl, %al > - jne L(set) > - > -/* We get there only if we already know there is a > -difference. */ > - > - cmp %ecx, %eax > -L(set): > - sbb %eax, %eax > - sbb $-1, %eax > - ret > -# else > - > -/* for wmemcmp */ > - .p2align 4 > -L(find_diff): > - mov $1, %eax > - jg L(find_diff_bigger) > - neg %eax > - ret > - > - .p2align 4 > -L(find_diff_bigger): > - ret > -# endif > - > - .p2align 4 > -L(equal): > - xor %eax, %eax > - ret > - > -END (MEMCMP) > -#endif > diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > deleted file mode 100644 > index a41ef95fc1..0000000000 > --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_WMEMCMP 1 > -#define MEMCMP __wmemcmp_ssse3 > - > -#include "memcmp-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
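As a rough stand-alone sketch of the two-tier selector that remains in ifunc-memcmp.h after this patch (not glibc's code: glibc uses its internal CPU_FEATURE_USABLE_P machinery, approximated here with GCC's __builtin_cpu_supports; the AVX2/EVEX tiers are omitted and the my_memcmp_* names and bodies are made up):

    #include <stddef.h>
    #include <string.h>

    typedef int (*memcmp_fn) (const void *, const void *, size_t);

    /* Stand-in bodies; the real __memcmp_sse4_1 and __memcmp_sse2
       are hand-written assembly.  */
    static int
    my_memcmp_sse4_1 (const void *a, const void *b, size_t n)
    { return memcmp (a, b, n); }

    static int
    my_memcmp_sse2 (const void *a, const void *b, size_t n)
    { return memcmp (a, b, n); }

    /* With the SSSE3 tier removed, anything lacking SSE4.1 drops
       straight to the SSE2 baseline.  */
    static memcmp_fn
    my_memcmp_resolver (void)
    {
      if (__builtin_cpu_supports ("sse4.1"))
        return my_memcmp_sse4_1;
      return my_memcmp_sse2;
    }

    /* ELF GNU indirect function: the resolver runs once during
       relocation and binds my_memcmp to the chosen version.  */
    int my_memcmp (const void *a, const void *b, size_t n)
      __attribute__ ((ifunc ("my_memcmp_resolver")));

The practical effect of the removal is only that CPUs with SSSE3 but without SSE4.1 now land on the SSE2 baseline, the trade-off the commit message describes as worthwhile given the ~2000 lines of code saved.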
* [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein @ 2022-03-25 18:36 ` Noah Goldstein 2022-03-25 19:56 ` H.J. Lu 2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein ` (4 subsequent siblings) 6 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it is no longer worth it to keep the SSSE3 versions given the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +- sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 -------------------- sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 - 5 files changed, 7 insertions(+), 3183 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index ed2def288d..48f81711ae 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -16,7 +16,6 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ - memcpy-ssse3 \ memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ @@ -24,7 +23,6 @@ sysdep_routines += \ memmove-avx512-unaligned-erms \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ - memmove-ssse3 \ memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 7e2be3554b..70b0e9c62e 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (SSSE3), __memmove_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, @@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memmove_avx512_unaligned_erms) IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), __memmove_ssse3_back) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned) @@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (SSSE3), __memcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, @@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __memcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), __memcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512F), __memcpy_avx512_no_vzeroupper) @@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct
libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (SSSE3), __mempcpy_chk_ssse3_back) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, @@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, __mempcpy_evex_unaligned_erms) IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), __mempcpy_ssse3_back) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index f8f958064c..1ecdd4b0d3 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) attribute_hidden; @@ -94,17 +92,15 @@ IFUNC_SELECTOR (void) } } - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) { - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) - return OPTIMIZE (sse2_unaligned_erms); - - return OPTIMIZE (sse2_unaligned); + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) + return OPTIMIZE (ssse3_back); } - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (sse2_unaligned_erms); - return OPTIMIZE (ssse3); + return OPTIMIZE (sse2_unaligned); } diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S deleted file mode 100644 index 65644d3a09..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S +++ /dev/null @@ -1,3151 +0,0 @@ -/* memcpy with SSSE3 - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3 -# define MEMCPY_CHK __memcpy_chk_ssse3 -# define MEMPCPY __mempcpy_ssse3 -# define MEMPCPY_CHK __mempcpy_chk_ssse3 -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. 
TABLE is a jump table with - relative offsets. INDEX is a register contains the index into the - jump table. SCALE is the scale of INDEX. */ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(write_0bytes) - cmp $79, %rdx - jbe L(copy_forward) - jmp L(copy_backward) -L(copy_forward): -#endif -L(start): - cmp $79, %rdx - lea L(table_less_80bytes)(%rip), %r11 - ja L(80bytesormore) - movslq (%r11, %rdx, 4), %r9 - add %rdx, %rsi - add %rdx, %rdi - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(80bytesormore): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - - movdqu (%rsi), %xmm0 - mov %rdi, %rcx - and $-16, %rdi - add $16, %rdi - mov %rcx, %r8 - sub %rdi, %rcx - add %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_fwd) - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) - - .p2align 4 -L(copy_backward): - movdqu -16(%rsi, %rdx), %xmm0 - add %rdx, %rsi - lea -16(%rdi, %rdx), %r8 - add %rdx, %rdi - - mov %rdi, %rcx - and $0xf, %rcx - xor %rcx, %rdi - sub %rcx, %rdx - sub %rcx, %rsi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif - - cmp %rcx, %rdx - mov %rsi, %r9 - ja L(large_page_bwd) - and $0xf, %r9 - jz L(shl_0_bwd) -#ifdef DATA_CACHE_SIZE_HALF - mov $DATA_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_data_cache_size_half(%rip), %RCX_LP -#endif - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) - - .p2align 4 -L(shl_0): - sub $16, %rdx - movdqa (%rsi), %xmm1 - add $16, %rsi - movdqa %xmm1, (%rdi) - add $16, %rdi - cmp $128, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble) - cmp $64, %rdx - jb L(shl_0_less_64bytes) - movaps (%rsi), %xmm4 - movaps 16(%rsi), %xmm1 - movaps 32(%rsi), %xmm2 - movaps 48(%rsi), %xmm3 - movaps %xmm4, (%rdi) - movaps %xmm1, 16(%rdi) - movaps %xmm2, 32(%rdi) - movaps %xmm3, 48(%rdi) - sub $64, %rdx - add $64, %rsi - add $64, %rdi -L(shl_0_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_loop) -L(shl_0_gobble_cache_loop): - movdqa (%rsi), %xmm4 - movaps 0x10(%rsi), %xmm1 - movaps 0x20(%rsi), %xmm2 - movaps 0x30(%rsi), %xmm3 - - movdqa %xmm4, (%rdi) - movaps %xmm1, 
0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - - sub $128, %rdx - movaps 0x40(%rsi), %xmm4 - movaps 0x50(%rsi), %xmm5 - movaps 0x60(%rsi), %xmm6 - movaps 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_cache_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_cache_less_64bytes) - - movdqa (%rsi), %xmm4 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm4, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm4 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm4, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_cache_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_loop): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x280(%rsi) - - movdqa (%rsi), %xmm0 - movdqa 0x10(%rsi), %xmm1 - movdqa 0x20(%rsi), %xmm2 - movdqa 0x30(%rsi), %xmm3 - movdqa 0x40(%rsi), %xmm4 - movdqa 0x50(%rsi), %xmm5 - movdqa 0x60(%rsi), %xmm6 - movdqa 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_less_64bytes) - - movdqa (%rsi), %xmm0 - sub $0x40, %rdx - movdqa 0x10(%rsi), %xmm1 - - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - - movdqa 0x20(%rsi), %xmm0 - movdqa 0x30(%rsi), %xmm1 - add $0x40, %rsi - - movdqa %xmm0, 0x20(%rdi) - movdqa %xmm1, 0x30(%rdi) - add $0x40, %rdi -L(shl_0_mem_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_less_32bytes) - movdqa (%rsi), %xmm0 - sub $0x20, %rdx - movdqa 0x10(%rsi), %xmm1 - add $0x20, %rsi - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - add $0x20, %rdi -L(shl_0_mem_less_32bytes): - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $16, %rdx - movdqa -0x10(%rsi), %xmm1 - sub $16, %rsi - movdqa %xmm1, -0x10(%rdi) - sub $16, %rdi - cmp $0x80, %rdx - movdqu %xmm0, (%r8) - ja L(shl_0_gobble_bwd) - cmp $64, %rdx - jb L(shl_0_less_64bytes_bwd) - movaps -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - sub $64, %rdx - sub $0x40, %rsi - sub $0x40, %rdi -L(shl_0_less_64bytes_bwd): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_bwd): -#ifdef DATA_CACHE_SIZE_HALF - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP -#else - cmp __x86_data_cache_size_half(%rip), %RDX_LP -#endif - lea -128(%rdx), %rdx - jae L(shl_0_gobble_mem_bwd_loop) -L(shl_0_gobble_bwd_loop): - movdqa -0x10(%rsi), %xmm0 - movaps -0x20(%rsi), %xmm1 - movaps -0x30(%rsi), %xmm2 - movaps -0x40(%rsi), %xmm3 - - movdqa %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - - sub $0x80, %rdx - movaps -0x50(%rsi), %xmm4 - movaps -0x60(%rsi), %xmm5 - movaps -0x70(%rsi), %xmm6 - movaps -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae 
L(shl_0_gobble_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_gobble_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_gobble_bwd_less_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_0_gobble_mem_bwd_loop): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x280(%rsi) - movdqa -0x10(%rsi), %xmm0 - movdqa -0x20(%rsi), %xmm1 - movdqa -0x30(%rsi), %xmm2 - movdqa -0x40(%rsi), %xmm3 - movdqa -0x50(%rsi), %xmm4 - movdqa -0x60(%rsi), %xmm5 - movdqa -0x70(%rsi), %xmm6 - movdqa -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - sub $0x80, %rdx - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - movdqa %xmm2, -0x30(%rdi) - movdqa %xmm3, -0x40(%rdi) - movdqa %xmm4, -0x50(%rdi) - movdqa %xmm5, -0x60(%rdi) - movdqa %xmm6, -0x70(%rdi) - movdqa %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - - jae L(shl_0_gobble_mem_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(shl_0_mem_bwd_less_64bytes) - - movdqa -0x10(%rsi), %xmm0 - sub $0x40, %rdx - movdqa -0x20(%rsi), %xmm1 - - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - - movdqa -0x30(%rsi), %xmm0 - movdqa -0x40(%rsi), %xmm1 - sub $0x40, %rsi - - movdqa %xmm0, -0x30(%rdi) - movdqa %xmm1, -0x40(%rdi) - sub $0x40, %rdi -L(shl_0_mem_bwd_less_64bytes): - cmp $0x20, %rdx - jb L(shl_0_mem_bwd_less_32bytes) - movdqa -0x10(%rsi), %xmm0 - sub $0x20, %rdx - movdqa -0x20(%rsi), %xmm1 - sub $0x20, %rsi - movdqa %xmm0, -0x10(%rdi) - movdqa %xmm1, -0x20(%rdi) - sub $0x20, %rdi -L(shl_0_mem_bwd_less_32bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1): - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_fwd) - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 -L(L1_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_1_loop_L1): - sub $64, %rdx - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $1, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $1, %xmm3, %xmm4 - palignr $1, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $1, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_1_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x01(%rsi), %xmm1 - jb L(L1_bwd) - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 -L(L1_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_1_bwd_loop_L1): - movaps -0x11(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x21(%rsi), %xmm3 - movaps -0x31(%rsi), %xmm4 - movaps -0x41(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $1, %xmm2, %xmm1 - palignr $1, %xmm3, %xmm2 - palignr $1, %xmm4, %xmm3 - palignr $1, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 
0x10(%rdi) - jb L(shl_1_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_1_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2): - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_fwd) - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 -L(L2_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_2_loop_L1): - sub $64, %rdx - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $2, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $2, %xmm3, %xmm4 - palignr $2, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $2, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_2_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x02(%rsi), %xmm1 - jb L(L2_bwd) - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 -L(L2_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_2_bwd_loop_L1): - movaps -0x12(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x22(%rsi), %xmm3 - movaps -0x32(%rsi), %xmm4 - movaps -0x42(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $2, %xmm2, %xmm1 - palignr $2, %xmm3, %xmm2 - palignr $2, %xmm4, %xmm3 - palignr $2, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_2_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_2_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3): - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_fwd) - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 -L(L3_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_3_loop_L1): - sub $64, %rdx - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $3, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $3, %xmm3, %xmm4 - palignr $3, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $3, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_3_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x03(%rsi), %xmm1 - jb L(L3_bwd) - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 -L(L3_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_3_bwd_loop_L1): - movaps -0x13(%rsi), %xmm2 - sub $0x40, %rdx - 
movaps -0x23(%rsi), %xmm3 - movaps -0x33(%rsi), %xmm4 - movaps -0x43(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $3, %xmm2, %xmm1 - palignr $3, %xmm3, %xmm2 - palignr $3, %xmm4, %xmm3 - palignr $3, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_3_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_3_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4): - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_fwd) - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 -L(L4_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_4_loop_L1): - sub $64, %rdx - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $4, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $4, %xmm3, %xmm4 - palignr $4, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $4, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_4_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_4_bwd): - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x04(%rsi), %xmm1 - jb L(L4_bwd) - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 -L(L4_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_4_bwd_loop_L1): - movaps -0x14(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x24(%rsi), %xmm3 - movaps -0x34(%rsi), %xmm4 - movaps -0x44(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $4, %xmm2, %xmm1 - palignr $4, %xmm3, %xmm2 - palignr $4, %xmm4, %xmm3 - palignr $4, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_4_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_4_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5): - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_fwd) - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 -L(L5_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_5_loop_L1): - sub $64, %rdx - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $5, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $5, %xmm3, %xmm4 - palignr $5, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $5, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_5_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - lea 
(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x05(%rsi), %xmm1 - jb L(L5_bwd) - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 -L(L5_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_5_bwd_loop_L1): - movaps -0x15(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x25(%rsi), %xmm3 - movaps -0x35(%rsi), %xmm4 - movaps -0x45(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $5, %xmm2, %xmm1 - palignr $5, %xmm3, %xmm2 - palignr $5, %xmm4, %xmm3 - palignr $5, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_5_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_5_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6): - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_fwd) - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 -L(L6_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_6_loop_L1): - sub $64, %rdx - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $6, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $6, %xmm3, %xmm4 - palignr $6, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $6, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_6_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x06(%rsi), %xmm1 - jb L(L6_bwd) - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 -L(L6_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_6_bwd_loop_L1): - movaps -0x16(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x26(%rsi), %xmm3 - movaps -0x36(%rsi), %xmm4 - movaps -0x46(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $6, %xmm2, %xmm1 - palignr $6, %xmm3, %xmm2 - palignr $6, %xmm4, %xmm3 - palignr $6, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_6_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_6_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7): - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_fwd) - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 -L(L7_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_7_loop_L1): - sub $64, %rdx - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $7, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $7, %xmm3, %xmm4 - palignr $7, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $7, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb 
L(shl_7_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x07(%rsi), %xmm1 - jb L(L7_bwd) - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 -L(L7_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_7_bwd_loop_L1): - movaps -0x17(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x27(%rsi), %xmm3 - movaps -0x37(%rsi), %xmm4 - movaps -0x47(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $7, %xmm2, %xmm1 - palignr $7, %xmm3, %xmm2 - palignr $7, %xmm4, %xmm3 - palignr $7, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_7_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_7_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8): - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_fwd) - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 -L(L8_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 -L(shl_8_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_8_loop_L1): - sub $64, %rdx - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $8, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $8, %xmm3, %xmm4 - palignr $8, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $8, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_8_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 - .p2align 4 -L(shl_8_end): - lea 64(%rdx), %rdx - movaps %xmm4, -0x20(%rdi) - add %rdx, %rsi - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x08(%rsi), %xmm1 - jb L(L8_bwd) - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 -L(L8_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_8_bwd_loop_L1): - movaps -0x18(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x28(%rsi), %xmm3 - movaps -0x38(%rsi), %xmm4 - movaps -0x48(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $8, %xmm2, %xmm1 - palignr $8, %xmm3, %xmm2 - palignr $8, %xmm4, %xmm3 - palignr $8, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_8_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_8_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9): - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_fwd) - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 -L(L9_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_9_loop_L1): - sub $64, %rdx - movaps 
0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $9, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $9, %xmm3, %xmm4 - palignr $9, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $9, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_9_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x09(%rsi), %xmm1 - jb L(L9_bwd) - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 -L(L9_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_9_bwd_loop_L1): - movaps -0x19(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x29(%rsi), %xmm3 - movaps -0x39(%rsi), %xmm4 - movaps -0x49(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $9, %xmm2, %xmm1 - palignr $9, %xmm3, %xmm2 - palignr $9, %xmm4, %xmm3 - palignr $9, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_9_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_9_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10): - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_fwd) - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 -L(L10_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_10_loop_L1): - sub $64, %rdx - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $10, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $10, %xmm3, %xmm4 - palignr $10, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $10, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_10_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0a(%rsi), %xmm1 - jb L(L10_bwd) - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 -L(L10_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_10_bwd_loop_L1): - movaps -0x1a(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2a(%rsi), %xmm3 - movaps -0x3a(%rsi), %xmm4 - movaps -0x4a(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $10, %xmm2, %xmm1 - palignr $10, %xmm3, %xmm2 - palignr $10, %xmm4, %xmm3 - palignr $10, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_10_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_10_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - 
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11): - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_fwd) - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 -L(L11_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_11_loop_L1): - sub $64, %rdx - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $11, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $11, %xmm3, %xmm4 - palignr $11, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $11, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_11_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0b(%rsi), %xmm1 - jb L(L11_bwd) - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 -L(L11_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_11_bwd_loop_L1): - movaps -0x1b(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2b(%rsi), %xmm3 - movaps -0x3b(%rsi), %xmm4 - movaps -0x4b(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $11, %xmm2, %xmm1 - palignr $11, %xmm3, %xmm2 - palignr $11, %xmm4, %xmm3 - palignr $11, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_11_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_11_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12): - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_fwd) - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 -L(L12_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_12_loop_L1): - sub $64, %rdx - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $12, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $12, %xmm3, %xmm4 - palignr $12, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $12, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_12_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0c(%rsi), %xmm1 - jb L(L12_bwd) - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 -L(L12_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_12_bwd_loop_L1): - movaps -0x1c(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2c(%rsi), %xmm3 - movaps -0x3c(%rsi), %xmm4 - movaps -0x4c(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr 
$12, %xmm2, %xmm1 - palignr $12, %xmm3, %xmm2 - palignr $12, %xmm4, %xmm3 - palignr $12, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_12_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_12_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13): - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_fwd) - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 -L(L13_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_13_loop_L1): - sub $64, %rdx - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $13, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $13, %xmm3, %xmm4 - palignr $13, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $13, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_13_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0d(%rsi), %xmm1 - jb L(L13_bwd) - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 -L(L13_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_13_bwd_loop_L1): - movaps -0x1d(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2d(%rsi), %xmm3 - movaps -0x3d(%rsi), %xmm4 - movaps -0x4d(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $13, %xmm2, %xmm1 - palignr $13, %xmm3, %xmm2 - palignr $13, %xmm4, %xmm3 - palignr $13, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_13_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_13_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14): - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0e(%rsi), %xmm1 - jb L(L14_fwd) - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 -L(L14_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_14_loop_L1): - sub $64, %rdx - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $14, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $14, %xmm3, %xmm4 - palignr $14, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $14, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_14_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps 
-0x0e(%rsi), %xmm1 - jb L(L14_bwd) - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 -L(L14_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_14_bwd_loop_L1): - movaps -0x1e(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2e(%rsi), %xmm3 - movaps -0x3e(%rsi), %xmm4 - movaps -0x4e(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $14, %xmm2, %xmm1 - palignr $14, %xmm3, %xmm2 - palignr $14, %xmm4, %xmm3 - palignr $14, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_14_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_14_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15): - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_fwd) - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 -L(L15_fwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_loop_L2): - prefetchnta 0x1c0(%rsi) -L(shl_15_loop_L1): - sub $64, %rdx - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movdqa %xmm5, %xmm6 - palignr $15, %xmm4, %xmm5 - lea 64(%rsi), %rsi - palignr $15, %xmm3, %xmm4 - palignr $15, %xmm2, %xmm3 - lea 64(%rdi), %rdi - palignr $15, %xmm1, %xmm2 - movdqa %xmm6, %xmm1 - movdqa %xmm2, -0x40(%rdi) - movaps %xmm3, -0x30(%rdi) - jb L(shl_15_end) - movaps %xmm4, -0x20(%rdi) - movaps %xmm5, -0x10(%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_end): - movaps %xmm4, -0x20(%rdi) - lea 64(%rdx), %rdx - movaps %xmm5, -0x10(%rdi) - add %rdx, %rdi - movdqu %xmm0, (%r8) - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 - cmp %rcx, %rdx - movaps -0x0f(%rsi), %xmm1 - jb L(L15_bwd) - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 -L(L15_bwd): - lea -64(%rdx), %rdx - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_loop_L2): - prefetchnta -0x1c0(%rsi) -L(shl_15_bwd_loop_L1): - movaps -0x1f(%rsi), %xmm2 - sub $0x40, %rdx - movaps -0x2f(%rsi), %xmm3 - movaps -0x3f(%rsi), %xmm4 - movaps -0x4f(%rsi), %xmm5 - lea -0x40(%rsi), %rsi - palignr $15, %xmm2, %xmm1 - palignr $15, %xmm3, %xmm2 - palignr $15, %xmm4, %xmm3 - palignr $15, %xmm5, %xmm4 - - movaps %xmm1, -0x10(%rdi) - movaps %xmm5, %xmm1 - - movaps %xmm2, -0x20(%rdi) - lea -0x40(%rdi), %rdi - - movaps %xmm3, 0x10(%rdi) - jb L(shl_15_bwd_end) - movaps %xmm4, (%rdi) - _CET_NOTRACK jmp *%r9 - ud2 -L(shl_15_bwd_end): - movaps %xmm4, (%rdi) - lea 64(%rdx), %rdx - movdqu %xmm0, (%r8) - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) - - .p2align 4 -L(write_72bytes): - movdqu -72(%rsi), %xmm0 - movdqu -56(%rsi), %xmm1 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -72(%rdi) - movdqu %xmm1, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_64bytes): - movdqu -64(%rsi), %xmm0 - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - movdqu %xmm0, -64(%rdi) - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 
4 -L(write_56bytes): - movdqu -56(%rsi), %xmm0 - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rcx - movdqu %xmm0, -56(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(write_48bytes): - mov -48(%rsi), %rcx - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %rcx, -48(%rdi) - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_40bytes): - mov -40(%rsi), %r8 - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r8, -40(%rdi) - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_32bytes): - mov -32(%rsi), %r9 - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r9, -32(%rdi) - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_24bytes): - mov -24(%rsi), %r10 - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r10, -24(%rdi) - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_16bytes): - mov -16(%rsi), %r11 - mov -8(%rsi), %rdx - mov %r11, -16(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) -L(write_0bytes): - ret - - .p2align 4 -L(write_73bytes): - movdqu -73(%rsi), %xmm0 - movdqu -57(%rsi), %xmm1 - mov -41(%rsi), %rcx - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %r8 - mov -4(%rsi), %edx - movdqu %xmm0, -73(%rdi) - movdqu %xmm1, -57(%rdi) - mov %rcx, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %r8, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_65bytes): - movdqu -65(%rsi), %xmm0 - movdqu -49(%rsi), %xmm1 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -65(%rdi) - movdqu %xmm1, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_57bytes): - movdqu -57(%rsi), %xmm0 - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -57(%rdi) - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_49bytes): - movdqu -49(%rsi), %xmm0 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -49(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_41bytes): - mov -41(%rsi), %r8 - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r8, -41(%rdi) - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_33bytes): - mov -33(%rsi), %r9 - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r9, -33(%rdi) - mov %r10, -25(%rdi) - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_25bytes): - mov -25(%rsi), %r10 - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -1(%rsi), %dl - mov %r10, -25(%rdi) - 
mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_17bytes): - mov -17(%rsi), %r11 - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -17(%rdi) - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_9bytes): - mov -9(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -9(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_1bytes): - mov -1(%rsi), %dl - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(write_74bytes): - movdqu -74(%rsi), %xmm0 - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -74(%rdi) - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_66bytes): - movdqu -66(%rsi), %xmm0 - movdqu -50(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -66(%rdi) - movdqu %xmm1, -50(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_58bytes): - movdqu -58(%rsi), %xmm1 - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm1, -58(%rdi) - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_50bytes): - movdqu -50(%rsi), %xmm0 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -50(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_42bytes): - mov -42(%rsi), %r8 - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -42(%rdi) - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_34bytes): - mov -34(%rsi), %r9 - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -34(%rdi) - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_26bytes): - mov -26(%rsi), %r10 - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -26(%rdi) - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_18bytes): - mov -18(%rsi), %r11 - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -18(%rdi) - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_10bytes): - mov -10(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -10(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_2bytes): - mov -2(%rsi), %dx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(write_75bytes): - movdqu -75(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -75(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_67bytes): - movdqu -67(%rsi), %xmm0 - movdqu -59(%rsi), %xmm1 - mov -43(%rsi), %r8 - 
mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -67(%rdi) - movdqu %xmm1, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_59bytes): - movdqu -59(%rsi), %xmm0 - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -59(%rdi) - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_51bytes): - movdqu -51(%rsi), %xmm0 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -51(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_43bytes): - mov -43(%rsi), %r8 - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -43(%rdi) - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_35bytes): - mov -35(%rsi), %r9 - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -35(%rdi) - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_27bytes): - mov -27(%rsi), %r10 - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -27(%rdi) - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_19bytes): - mov -19(%rsi), %r11 - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -19(%rdi) - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_11bytes): - mov -11(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -11(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(write_76bytes): - movdqu -76(%rsi), %xmm0 - movdqu -60(%rsi), %xmm1 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -76(%rdi) - movdqu %xmm1, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_68bytes): - movdqu -68(%rsi), %xmm0 - movdqu -52(%rsi), %xmm1 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -68(%rdi) - movdqu %xmm1, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_60bytes): - movdqu -60(%rsi), %xmm0 - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -60(%rdi) - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_52bytes): - movdqu -52(%rsi), %xmm0 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - movdqu %xmm0, -52(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret 
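The L(write_Nbytes) stanzas in this stretch of the deleted file copy the final N bytes of the region: %rsi and %rdi have already been advanced past the end of both buffers, hence the negative offsets, and every load is issued before any store, which also keeps a stanza safe when the two regions overlap. A minimal C sketch of one such entry, L(write_12bytes), follows; the function and variable names are hypothetical and the code is illustrative, not taken from glibc.

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    /* Sketch of L(write_12bytes): callers pass pointers one byte PAST the
       end of each region, mirroring the negative offsets above.  Both
       loads complete before either store, so the copy is also correct
       for overlapping regions (the memmove case).  */
    static void
    write_12bytes (unsigned char *dst_end, const unsigned char *src_end)
    {
      uint64_t q;
      uint32_t d;
      memcpy (&q, src_end - 12, 8);   /* mov -12(%rsi), %rcx */
      memcpy (&d, src_end - 4, 4);    /* mov  -4(%rsi), %edx */
      memcpy (dst_end - 12, &q, 8);   /* mov %rcx, -12(%rdi) */
      memcpy (dst_end - 4, &d, 4);    /* mov %edx,  -4(%rdi) */
    }

    int
    main (void)
    {
      const char src[12] = "hello world";
      char dst[12];
      write_12bytes ((unsigned char *) dst + 12,
                     (const unsigned char *) src + 12);
      printf ("%s\n", dst);   /* prints "hello world" */
      return 0;
    }

The dispatch into these stanzas is the relative-offset jump table defined at the end of the file: each .int JMPTBL (entry, table) slot holds entry - table, and BRANCH_TO_JMPTBL_ENTRY re-adds the table's run-time address before the indirect jump, so the table needs only 4-byte position-independent slots rather than 8-byte absolute pointers.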
- - .p2align 4 -L(write_44bytes): - mov -44(%rsi), %r8 - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r8, -44(%rdi) - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_36bytes): - mov -36(%rsi), %r9 - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r9, -36(%rdi) - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_28bytes): - mov -28(%rsi), %r10 - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r10, -28(%rdi) - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_20bytes): - mov -20(%rsi), %r11 - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %r11, -20(%rdi) - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_12bytes): - mov -12(%rsi), %rcx - mov -4(%rsi), %edx - mov %rcx, -12(%rdi) - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(write_77bytes): - movdqu -77(%rsi), %xmm0 - movdqu -61(%rsi), %xmm1 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -77(%rdi) - movdqu %xmm1, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_69bytes): - movdqu -69(%rsi), %xmm0 - movdqu -53(%rsi), %xmm1 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -69(%rdi) - movdqu %xmm1, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_61bytes): - movdqu -61(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -61(%rdi) - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_53bytes): - movdqu -53(%rsi), %xmm0 - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -53(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_45bytes): - mov -45(%rsi), %r8 - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -45(%rdi) - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_37bytes): - mov -37(%rsi), %r9 - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -37(%rdi) - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_29bytes): - mov -29(%rsi), %r10 - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -29(%rdi) - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_21bytes): - mov -21(%rsi), %r11 - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -21(%rdi) - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 
-L(write_13bytes): - mov -13(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -13(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_78bytes): - movdqu -78(%rsi), %xmm0 - movdqu -62(%rsi), %xmm1 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -78(%rdi) - movdqu %xmm1, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_70bytes): - movdqu -70(%rsi), %xmm0 - movdqu -54(%rsi), %xmm1 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -70(%rdi) - movdqu %xmm1, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_62bytes): - movdqu -62(%rsi), %xmm0 - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -62(%rdi) - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_54bytes): - movdqu -54(%rsi), %xmm0 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -54(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_46bytes): - mov -46(%rsi), %r8 - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -46(%rdi) - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_38bytes): - mov -38(%rsi), %r9 - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -38(%rdi) - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_30bytes): - mov -30(%rsi), %r10 - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -30(%rdi) - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_22bytes): - mov -22(%rsi), %r11 - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -22(%rdi) - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_14bytes): - mov -14(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -14(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(write_79bytes): - movdqu -79(%rsi), %xmm0 - movdqu -63(%rsi), %xmm1 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -79(%rdi) - movdqu %xmm1, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_71bytes): - movdqu -71(%rsi), %xmm0 - movdqu -55(%rsi), %xmm1 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -71(%rdi) - movdqu %xmm1, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) 
- mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_63bytes): - movdqu -63(%rsi), %xmm0 - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -63(%rdi) - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_55bytes): - movdqu -55(%rsi), %xmm0 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - movdqu %xmm0, -55(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_47bytes): - mov -47(%rsi), %r8 - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r8, -47(%rdi) - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_39bytes): - mov -39(%rsi), %r9 - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r9, -39(%rdi) - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_31bytes): - mov -31(%rsi), %r10 - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r10, -31(%rdi) - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_23bytes): - mov -23(%rsi), %r11 - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %r11, -23(%rdi) - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_15bytes): - mov -15(%rsi), %rcx - mov -8(%rsi), %rdx - mov %rcx, -15(%rdi) - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(large_page_fwd): - movdqu (%rsi), %xmm1 - lea 16(%rsi), %rsi - movdqu %xmm0, (%r8) - movntdq %xmm1, (%rdi) - lea 16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - shl $2, %rcx - cmp %rcx, %rdx - jb L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif -L(large_page_loop): - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(large_page_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_64bytes): - add %rdx, %rsi - add %rdx, %rdi - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_fwd_start): - prefetcht0 0x1c0(%rsi) - prefetcht0 0x200(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 
0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lea 0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - movaps %xmm4, 0x40(%rdi) - movaps %xmm5, 0x50(%rdi) - movaps %xmm6, 0x60(%rdi) - movaps %xmm7, 0x70(%rdi) - lea 0x80(%rdi), %rdi - jae L(ll_cache_copy_fwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_fwd_64bytes) - - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - lea 0x40(%rsi), %rsi - - movaps %xmm0, (%rdi) - movaps %xmm1, 0x10(%rdi) - movaps %xmm2, 0x20(%rdi) - movaps %xmm3, 0x30(%rdi) - lea 0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_fwd_64bytes): - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#endif - .p2align 4 -L(large_page_bwd): - movdqu -0x10(%rsi), %xmm1 - lea -16(%rsi), %rsi - movdqu %xmm0, (%r8) - movdqa %xmm1, -0x10(%rdi) - lea -16(%rdi), %rdi - lea -0x90(%rdx), %rdx -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jb L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif -L(large_page_bwd_loop): - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - movntdq %xmm4, -0x50(%rdi) - movntdq %xmm5, -0x60(%rdi) - movntdq %xmm6, -0x70(%rdi) - movntdq %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(large_page_bwd_loop) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movntdq %xmm0, -0x10(%rdi) - movntdq %xmm1, -0x20(%rdi) - movntdq %xmm2, -0x30(%rdi) - movntdq %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_less_bwd_64bytes): - sfence - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) - -#ifdef USE_AS_MEMMOVE - .p2align 4 -L(ll_cache_copy_bwd_start): - prefetcht0 -0x1c0(%rsi) - prefetcht0 -0x200(%rsi) - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - movdqu -0x50(%rsi), %xmm4 - movdqu -0x60(%rsi), %xmm5 - movdqu -0x70(%rsi), %xmm6 - movdqu -0x80(%rsi), %xmm7 - lea -0x80(%rsi), %rsi - - sub $0x80, %rdx - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - movaps %xmm4, -0x50(%rdi) - movaps %xmm5, -0x60(%rdi) - movaps %xmm6, -0x70(%rdi) - movaps %xmm7, -0x80(%rdi) - lea -0x80(%rdi), %rdi - jae L(ll_cache_copy_bwd_start) - cmp $-0x40, %rdx - lea 0x80(%rdx), %rdx - jl L(large_page_ll_less_bwd_64bytes) - - movdqu -0x10(%rsi), %xmm0 - movdqu -0x20(%rsi), %xmm1 - movdqu -0x30(%rsi), %xmm2 - movdqu -0x40(%rsi), %xmm3 - lea -0x40(%rsi), %rsi - - movaps %xmm0, -0x10(%rdi) - movaps %xmm1, -0x20(%rdi) - movaps %xmm2, -0x30(%rdi) - movaps %xmm3, -0x40(%rdi) - lea -0x40(%rdi), %rdi - sub $0x40, %rdx -L(large_page_ll_less_bwd_64bytes): - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) -#endif - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_less_80bytes): - .int JMPTBL 
(L(write_0bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_13bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_63bytes), 
L(table_less_80bytes)) - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_74bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) - - .p2align 3 -L(shl_table): - .int JMPTBL (L(shl_0), L(shl_table)) - .int JMPTBL (L(shl_1), L(shl_table)) - .int JMPTBL (L(shl_2), L(shl_table)) - .int JMPTBL (L(shl_3), L(shl_table)) - .int JMPTBL (L(shl_4), L(shl_table)) - .int JMPTBL (L(shl_5), L(shl_table)) - .int JMPTBL (L(shl_6), L(shl_table)) - .int JMPTBL (L(shl_7), L(shl_table)) - .int JMPTBL (L(shl_8), L(shl_table)) - .int JMPTBL (L(shl_9), L(shl_table)) - .int JMPTBL (L(shl_10), L(shl_table)) - .int JMPTBL (L(shl_11), L(shl_table)) - .int JMPTBL (L(shl_12), L(shl_table)) - .int JMPTBL (L(shl_13), L(shl_table)) - .int JMPTBL (L(shl_14), L(shl_table)) - .int JMPTBL (L(shl_15), L(shl_table)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S deleted file mode 100644 index 295430b1ef..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3 -#define MEMCPY_CHK __memmove_chk_ssse3 -#include "memcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
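For context on what is being deleted: memcpy-ssse3.S combined three tricks that are easy to miss in a raw diff. Copies of fewer than 80 bytes dispatch through a position-independent jump table; the JMPTBL/BRANCH_TO_JMPTBL_ENTRY macros store target-minus-table offsets, so the .rodata.ssse3 tables above need no dynamic relocations. Each L(write_Nbytes) entry then finishes the copy with overlapping loads and stores addressed backwards from the end of the buffer. Copies whose source and destination are differently aligned run through the palignr-based L(shl_N) loops, and copies larger than half the shared cache switch to non-temporal movntdq stores completed by an sfence. Below is a minimal C sketch of the first two ideas, assuming GCC/Clang's labels-as-values extension; the function and table names are invented for illustration, and none of this is glibc code.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Position-independent dispatch: as with JMPTBL(I, B) = I - B, each
   table entry is the distance from a base label to its target, so the
   table itself is const data needing no run-time relocations.  */
static void
copy_upto_3 (char *dst, const char *src, unsigned n) /* n in [0,3] */
{
  static const int tail_table[] = {
    &&t0 - &&t0, &&t1 - &&t0, &&t2 - &&t0, &&t3 - &&t0,
  };
  goto *(&&t0 + tail_table[n]);
 t3: dst[2] = src[2];	/* fall through */
 t2: dst[1] = src[1];
 t1: dst[0] = src[0];
 t0: return;
}

/* Overlapping tail copy: as in L(write_9bytes)..L(write_16bytes), two
   8-byte moves cover any length in [8,16] because they may overlap.  */
static void
copy_8_to_16 (char *dst, const char *src, size_t n) /* n in [8,16] */
{
  uint64_t head, tail;
  memcpy (&head, src, 8);
  memcpy (&tail, src + n - 8, 8);
  memcpy (dst, &head, 8);
  memcpy (dst + n - 8, &tail, 8);
}

int
main (void)
{
  char buf[20] = { 0 };
  copy_8_to_16 (buf, "overlapping tail", 13);
  copy_upto_3 (buf + 13, "!!!", 3);
  printf ("%.16s\n", buf);	/* prints "overlapping t!!!" */
  return 0;
}

The ifunc-memmove.h hunk quoted in the review that follows carries the patch's main behavioral change: SSSE3 stops being the fallback for older CPUs and survives only as __memmove_ssse3_back, chosen when the CPU has SSSE3, lacks Fast_Unaligned_Copy, and has Fast_Copy_Backward; everything else now falls through to the sse2_unaligned variants (with ERMS if available). A sketch of that new selection order, with the CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P tests stubbed as booleans and a hypothetical function name:

#include <stdbool.h>

/* Selection order after this patch (illustrative stub, not glibc).  */
static const char *
select_memmove (bool ssse3, bool fast_unaligned_copy,
		bool fast_copy_backward, bool erms)
{
  if (ssse3 && !fast_unaligned_copy)
    {
      if (fast_copy_backward)
	return "__memmove_ssse3_back";
      /* Plain __memmove_ssse3 is gone; fall through to SSE2.  */
    }
  if (erms)
    return "__memmove_sse2_unaligned_erms";
  return "__memmove_sse2_unaligned";
}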
* Re: [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein @ 2022-03-25 19:56 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:56 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - > sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +- > sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 -------------------- > sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 - > 5 files changed, 7 insertions(+), 3183 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index ed2def288d..48f81711ae 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -16,7 +16,6 @@ sysdep_routines += \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > memcmpeq-sse2 \ > - memcpy-ssse3 \ > memcpy-ssse3-back \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > @@ -24,7 +23,6 @@ sysdep_routines += \ > memmove-avx512-unaligned-erms \ > memmove-evex-unaligned-erms \ > memmove-sse2-unaligned-erms \ > - memmove-ssse3 \ > memmove-ssse3-back \ > memrchr-avx2 \ > memrchr-avx2-rtm \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 7e2be3554b..70b0e9c62e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (SSSE3), > __memmove_chk_ssse3_back) > - IFUNC_IMPL_ADD (array, i, __memmove_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memmove_chk_ssse3) > IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, > __memmove_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, > @@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memmove_avx512_unaligned_erms) > IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > __memmove_ssse3_back) > - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > - __memmove_ssse3) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) > IFUNC_IMPL_ADD (array, i, memmove, 1, > __memmove_sse2_unaligned) > @@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __memcpy_chk_ssse3_back) > - IFUNC_IMPL_ADD (array, i, __memcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memcpy_chk_ssse3) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, > __memcpy_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, > @@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memcpy_evex_unaligned_erms) > IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > __memcpy_ssse3_back) > - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > - __memcpy_ssse3) > IFUNC_IMPL_ADD (array, i, memcpy, > CPU_FEATURE_USABLE (AVX512F), > __memcpy_avx512_no_vzeroupper) > @@ -973,9 +963,6 @@
__libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (SSSE3), > __mempcpy_chk_ssse3_back) > - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_chk_ssse3) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, > __mempcpy_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, > @@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __mempcpy_evex_unaligned_erms) > IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > __mempcpy_ssse3_back) > - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_ssse3) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > __mempcpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h > index f8f958064c..1ecdd4b0d3 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h > @@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) > attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) > attribute_hidden; > @@ -94,17 +92,15 @@ IFUNC_SELECTOR (void) > } > } > > - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) > { > - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > - return OPTIMIZE (sse2_unaligned_erms); > - > - return OPTIMIZE (sse2_unaligned); > + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) > + return OPTIMIZE (ssse3_back); > } > > - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) > - return OPTIMIZE (ssse3_back); > + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > + return OPTIMIZE (sse2_unaligned_erms); > > - return OPTIMIZE (ssse3); > + return OPTIMIZE (sse2_unaligned); > } > diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S > deleted file mode 100644 > index 65644d3a09..0000000000 > --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S > +++ /dev/null > @@ -1,3151 +0,0 @@ > -/* memcpy with SSSE3 > - Copyright (C) 2010-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#include <sysdep.h> > - > -#if IS_IN (libc) > - > -#include "asm-syntax.h" > - > -#ifndef MEMCPY > -# define MEMCPY __memcpy_ssse3 > -# define MEMCPY_CHK __memcpy_chk_ssse3 > -# define MEMPCPY __mempcpy_ssse3 > -# define MEMPCPY_CHK __mempcpy_chk_ssse3 > -#endif > - > -#define JMPTBL(I, B) I - B > - > -/* Branch to an entry in a jump table. TABLE is a jump table with > - relative offsets. INDEX is a register contains the index into the > - jump table. SCALE is the scale of INDEX. */ > -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), INDEX; \ > - lea (%r11, INDEX), INDEX; \ > - _CET_NOTRACK jmp *INDEX; \ > - ud2 > - > - .section .text.ssse3,"ax",@progbits > -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE > -ENTRY (MEMPCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMPCPY_CHK) > - > -ENTRY (MEMPCPY) > - mov %RDI_LP, %RAX_LP > - add %RDX_LP, %RAX_LP > - jmp L(start) > -END (MEMPCPY) > -#endif > - > -#if !defined USE_AS_BCOPY > -ENTRY (MEMCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMCPY_CHK) > -#endif > - > -ENTRY (MEMCPY) > - mov %RDI_LP, %RAX_LP > -#ifdef USE_AS_MEMPCPY > - add %RDX_LP, %RAX_LP > -#endif > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -#endif > - > -#ifdef USE_AS_MEMMOVE > - cmp %rsi, %rdi > - jb L(copy_forward) > - je L(write_0bytes) > - cmp $79, %rdx > - jbe L(copy_forward) > - jmp L(copy_backward) > -L(copy_forward): > -#endif > -L(start): > - cmp $79, %rdx > - lea L(table_less_80bytes)(%rip), %r11 > - ja L(80bytesormore) > - movslq (%r11, %rdx, 4), %r9 > - add %rdx, %rsi > - add %rdx, %rdi > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(80bytesormore): > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jle L(copy_backward) > -#endif > - > - movdqu (%rsi), %xmm0 > - mov %rdi, %rcx > - and $-16, %rdi > - add $16, %rdi > - mov %rcx, %r8 > - sub %rdi, %rcx > - add %rcx, %rdx > - sub %rcx, %rsi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > - cmp %rcx, %rdx > - mov %rsi, %r9 > - ja L(large_page_fwd) > - and $0xf, %r9 > - jz L(shl_0) > -#ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_data_cache_size_half(%rip), %RCX_LP > -#endif > - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4) > - > - .p2align 4 > -L(copy_backward): > - movdqu -16(%rsi, %rdx), %xmm0 > - add %rdx, %rsi > - lea -16(%rdi, %rdx), %r8 > - add %rdx, %rdi > - > - mov %rdi, %rcx > - and $0xf, %rcx > - xor %rcx, %rdi > - sub %rcx, %rdx > - sub %rcx, %rsi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > - > - cmp %rcx, %rdx > - mov %rsi, %r9 > - ja L(large_page_bwd) > - and $0xf, %r9 > - jz L(shl_0_bwd) > -#ifdef DATA_CACHE_SIZE_HALF > - mov $DATA_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_data_cache_size_half(%rip), %RCX_LP > -#endif > - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4) > - > - .p2align 4 > -L(shl_0): > - sub $16, %rdx > - movdqa (%rsi), %xmm1 > - add $16, %rsi > - movdqa %xmm1, (%rdi) > - add $16, %rdi > - cmp $128, %rdx > - movdqu %xmm0, (%r8) > - ja L(shl_0_gobble) > - cmp $64, %rdx > - jb L(shl_0_less_64bytes) > - movaps (%rsi), %xmm4 > - movaps 16(%rsi), %xmm1 > - movaps 32(%rsi), %xmm2 > - movaps 48(%rsi), %xmm3 > - movaps %xmm4, 
(%rdi) > - movaps %xmm1, 16(%rdi) > - movaps %xmm2, 32(%rdi) > - movaps %xmm3, 48(%rdi) > - sub $64, %rdx > - add $64, %rsi > - add $64, %rdi > -L(shl_0_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble): > -#ifdef DATA_CACHE_SIZE_HALF > - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %RDX_LP > -#endif > - lea -128(%rdx), %rdx > - jae L(shl_0_gobble_mem_loop) > -L(shl_0_gobble_cache_loop): > - movdqa (%rsi), %xmm4 > - movaps 0x10(%rsi), %xmm1 > - movaps 0x20(%rsi), %xmm2 > - movaps 0x30(%rsi), %xmm3 > - > - movdqa %xmm4, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - > - sub $128, %rdx > - movaps 0x40(%rsi), %xmm4 > - movaps 0x50(%rsi), %xmm5 > - movaps 0x60(%rsi), %xmm6 > - movaps 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - movaps %xmm4, 0x40(%rdi) > - movaps %xmm5, 0x50(%rdi) > - movaps %xmm6, 0x60(%rdi) > - movaps %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_cache_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_cache_less_64bytes) > - > - movdqa (%rsi), %xmm4 > - sub $0x40, %rdx > - movdqa 0x10(%rsi), %xmm1 > - > - movdqa %xmm4, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - > - movdqa 0x20(%rsi), %xmm4 > - movdqa 0x30(%rsi), %xmm1 > - add $0x40, %rsi > - > - movdqa %xmm4, 0x20(%rdi) > - movdqa %xmm1, 0x30(%rdi) > - add $0x40, %rdi > -L(shl_0_cache_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_mem_loop): > - prefetcht0 0x1c0(%rsi) > - prefetcht0 0x280(%rsi) > - > - movdqa (%rsi), %xmm0 > - movdqa 0x10(%rsi), %xmm1 > - movdqa 0x20(%rsi), %xmm2 > - movdqa 0x30(%rsi), %xmm3 > - movdqa 0x40(%rsi), %xmm4 > - movdqa 0x50(%rsi), %xmm5 > - movdqa 0x60(%rsi), %xmm6 > - movdqa 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - sub $0x80, %rdx > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - movdqa %xmm2, 0x20(%rdi) > - movdqa %xmm3, 0x30(%rdi) > - movdqa %xmm4, 0x40(%rdi) > - movdqa %xmm5, 0x50(%rdi) > - movdqa %xmm6, 0x60(%rdi) > - movdqa %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_mem_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_mem_less_64bytes) > - > - movdqa (%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa 0x10(%rsi), %xmm1 > - > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - > - movdqa 0x20(%rsi), %xmm0 > - movdqa 0x30(%rsi), %xmm1 > - add $0x40, %rsi > - > - movdqa %xmm0, 0x20(%rdi) > - movdqa %xmm1, 0x30(%rdi) > - add $0x40, %rdi > -L(shl_0_mem_less_64bytes): > - cmp $0x20, %rdx > - jb L(shl_0_mem_less_32bytes) > - movdqa (%rsi), %xmm0 > - sub $0x20, %rdx > - movdqa 0x10(%rsi), %xmm1 > - add $0x20, %rsi > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - add $0x20, %rdi > -L(shl_0_mem_less_32bytes): > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_bwd): > - sub $16, %rdx > - movdqa -0x10(%rsi), %xmm1 > - sub $16, %rsi > - movdqa %xmm1, -0x10(%rdi) > - sub $16, %rdi > - cmp $0x80, %rdx > - movdqu %xmm0, (%r8) > - ja L(shl_0_gobble_bwd) > - cmp $64, %rdx > - jb L(shl_0_less_64bytes_bwd) > - movaps -0x10(%rsi), %xmm0 > - movaps -0x20(%rsi), %xmm1 > - movaps -0x30(%rsi), %xmm2 > - movaps -0x40(%rsi), %xmm3 > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) 
> - sub $64, %rdx > - sub $0x40, %rsi > - sub $0x40, %rdi > -L(shl_0_less_64bytes_bwd): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_bwd): > -#ifdef DATA_CACHE_SIZE_HALF > - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %RDX_LP > -#endif > - lea -128(%rdx), %rdx > - jae L(shl_0_gobble_mem_bwd_loop) > -L(shl_0_gobble_bwd_loop): > - movdqa -0x10(%rsi), %xmm0 > - movaps -0x20(%rsi), %xmm1 > - movaps -0x30(%rsi), %xmm2 > - movaps -0x40(%rsi), %xmm3 > - > - movdqa %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - > - sub $0x80, %rdx > - movaps -0x50(%rsi), %xmm4 > - movaps -0x60(%rsi), %xmm5 > - movaps -0x70(%rsi), %xmm6 > - movaps -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - movaps %xmm4, -0x50(%rdi) > - movaps %xmm5, -0x60(%rdi) > - movaps %xmm6, -0x70(%rdi) > - movaps %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_gobble_bwd_less_64bytes) > - > - movdqa -0x10(%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa -0x20(%rsi), %xmm1 > - > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - > - movdqa -0x30(%rsi), %xmm0 > - movdqa -0x40(%rsi), %xmm1 > - sub $0x40, %rsi > - > - movdqa %xmm0, -0x30(%rdi) > - movdqa %xmm1, -0x40(%rdi) > - sub $0x40, %rdi > -L(shl_0_gobble_bwd_less_64bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_0_gobble_mem_bwd_loop): > - prefetcht0 -0x1c0(%rsi) > - prefetcht0 -0x280(%rsi) > - movdqa -0x10(%rsi), %xmm0 > - movdqa -0x20(%rsi), %xmm1 > - movdqa -0x30(%rsi), %xmm2 > - movdqa -0x40(%rsi), %xmm3 > - movdqa -0x50(%rsi), %xmm4 > - movdqa -0x60(%rsi), %xmm5 > - movdqa -0x70(%rsi), %xmm6 > - movdqa -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - sub $0x80, %rdx > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - movdqa %xmm2, -0x30(%rdi) > - movdqa %xmm3, -0x40(%rdi) > - movdqa %xmm4, -0x50(%rdi) > - movdqa %xmm5, -0x60(%rdi) > - movdqa %xmm6, -0x70(%rdi) > - movdqa %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - > - jae L(shl_0_gobble_mem_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(shl_0_mem_bwd_less_64bytes) > - > - movdqa -0x10(%rsi), %xmm0 > - sub $0x40, %rdx > - movdqa -0x20(%rsi), %xmm1 > - > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - > - movdqa -0x30(%rsi), %xmm0 > - movdqa -0x40(%rsi), %xmm1 > - sub $0x40, %rsi > - > - movdqa %xmm0, -0x30(%rdi) > - movdqa %xmm1, -0x40(%rdi) > - sub $0x40, %rdi > -L(shl_0_mem_bwd_less_64bytes): > - cmp $0x20, %rdx > - jb L(shl_0_mem_bwd_less_32bytes) > - movdqa -0x10(%rsi), %xmm0 > - sub $0x20, %rdx > - movdqa -0x20(%rsi), %xmm1 > - sub $0x20, %rsi > - movdqa %xmm0, -0x10(%rdi) > - movdqa %xmm1, -0x20(%rdi) > - sub $0x20, %rdi > -L(shl_0_mem_bwd_less_32bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_1): > - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x01(%rsi), %xmm1 > - jb L(L1_fwd) > - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9 > -L(L1_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_1_loop_L1): > - sub $64, %rdx > - movaps 0x0f(%rsi), %xmm2 > - movaps 0x1f(%rsi), %xmm3 > - movaps 0x2f(%rsi), %xmm4 > - movaps 0x3f(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $1, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $1, %xmm3, 
%xmm4 > - palignr $1, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $1, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_1_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_1_bwd): > - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x01(%rsi), %xmm1 > - jb L(L1_bwd) > - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9 > -L(L1_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_1_bwd_loop_L1): > - movaps -0x11(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x21(%rsi), %xmm3 > - movaps -0x31(%rsi), %xmm4 > - movaps -0x41(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $1, %xmm2, %xmm1 > - palignr $1, %xmm3, %xmm2 > - palignr $1, %xmm4, %xmm3 > - palignr $1, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_1_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_1_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_2): > - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x02(%rsi), %xmm1 > - jb L(L2_fwd) > - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9 > -L(L2_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_2_loop_L1): > - sub $64, %rdx > - movaps 0x0e(%rsi), %xmm2 > - movaps 0x1e(%rsi), %xmm3 > - movaps 0x2e(%rsi), %xmm4 > - movaps 0x3e(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $2, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $2, %xmm3, %xmm4 > - palignr $2, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $2, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_2_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_2_bwd): > - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x02(%rsi), %xmm1 > - jb L(L2_bwd) > - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9 > -L(L2_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_2_bwd_loop_L1): > - movaps -0x12(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x22(%rsi), %xmm3 > - movaps -0x32(%rsi), %xmm4 > - movaps -0x42(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $2, %xmm2, %xmm1 > - palignr $2, %xmm3, %xmm2 > - palignr $2, %xmm4, %xmm3 > - palignr $2, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_2_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_2_bwd_end): > - movaps %xmm4, (%rdi) > 
- lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_3): > - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x03(%rsi), %xmm1 > - jb L(L3_fwd) > - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9 > -L(L3_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_3_loop_L1): > - sub $64, %rdx > - movaps 0x0d(%rsi), %xmm2 > - movaps 0x1d(%rsi), %xmm3 > - movaps 0x2d(%rsi), %xmm4 > - movaps 0x3d(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $3, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $3, %xmm3, %xmm4 > - palignr $3, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $3, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_3_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_3_bwd): > - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x03(%rsi), %xmm1 > - jb L(L3_bwd) > - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9 > -L(L3_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_3_bwd_loop_L1): > - movaps -0x13(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x23(%rsi), %xmm3 > - movaps -0x33(%rsi), %xmm4 > - movaps -0x43(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $3, %xmm2, %xmm1 > - palignr $3, %xmm3, %xmm2 > - palignr $3, %xmm4, %xmm3 > - palignr $3, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_3_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_3_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_4): > - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x04(%rsi), %xmm1 > - jb L(L4_fwd) > - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9 > -L(L4_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_4_loop_L1): > - sub $64, %rdx > - movaps 0x0c(%rsi), %xmm2 > - movaps 0x1c(%rsi), %xmm3 > - movaps 0x2c(%rsi), %xmm4 > - movaps 0x3c(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $4, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $4, %xmm3, %xmm4 > - palignr $4, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $4, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_4_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_4_bwd): > - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x04(%rsi), %xmm1 > - jb L(L4_bwd) > - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9 > -L(L4_bwd): > - lea -64(%rdx), %rdx 
> - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_4_bwd_loop_L1): > - movaps -0x14(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x24(%rsi), %xmm3 > - movaps -0x34(%rsi), %xmm4 > - movaps -0x44(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $4, %xmm2, %xmm1 > - palignr $4, %xmm3, %xmm2 > - palignr $4, %xmm4, %xmm3 > - palignr $4, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_4_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_4_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_5): > - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x05(%rsi), %xmm1 > - jb L(L5_fwd) > - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9 > -L(L5_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_5_loop_L1): > - sub $64, %rdx > - movaps 0x0b(%rsi), %xmm2 > - movaps 0x1b(%rsi), %xmm3 > - movaps 0x2b(%rsi), %xmm4 > - movaps 0x3b(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $5, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $5, %xmm3, %xmm4 > - palignr $5, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $5, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_5_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_5_bwd): > - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x05(%rsi), %xmm1 > - jb L(L5_bwd) > - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9 > -L(L5_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_5_bwd_loop_L1): > - movaps -0x15(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x25(%rsi), %xmm3 > - movaps -0x35(%rsi), %xmm4 > - movaps -0x45(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $5, %xmm2, %xmm1 > - palignr $5, %xmm3, %xmm2 > - palignr $5, %xmm4, %xmm3 > - palignr $5, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_5_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_5_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_6): > - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x06(%rsi), %xmm1 > - jb L(L6_fwd) > - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9 > -L(L6_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_6_loop_L1): > - sub $64, %rdx > - movaps 0x0a(%rsi), %xmm2 > - movaps 0x1a(%rsi), %xmm3 > - movaps 0x2a(%rsi), %xmm4 > - movaps 0x3a(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $6, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $6, %xmm3, %xmm4 > - palignr $6, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $6, %xmm1, 
%xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_6_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_6_bwd): > - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x06(%rsi), %xmm1 > - jb L(L6_bwd) > - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9 > -L(L6_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_6_bwd_loop_L1): > - movaps -0x16(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x26(%rsi), %xmm3 > - movaps -0x36(%rsi), %xmm4 > - movaps -0x46(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $6, %xmm2, %xmm1 > - palignr $6, %xmm3, %xmm2 > - palignr $6, %xmm4, %xmm3 > - palignr $6, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_6_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_6_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_7): > - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x07(%rsi), %xmm1 > - jb L(L7_fwd) > - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9 > -L(L7_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_7_loop_L1): > - sub $64, %rdx > - movaps 0x09(%rsi), %xmm2 > - movaps 0x19(%rsi), %xmm3 > - movaps 0x29(%rsi), %xmm4 > - movaps 0x39(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $7, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $7, %xmm3, %xmm4 > - palignr $7, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $7, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_7_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_7_bwd): > - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x07(%rsi), %xmm1 > - jb L(L7_bwd) > - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9 > -L(L7_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_7_bwd_loop_L1): > - movaps -0x17(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x27(%rsi), %xmm3 > - movaps -0x37(%rsi), %xmm4 > - movaps -0x47(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $7, %xmm2, %xmm1 > - palignr $7, %xmm3, %xmm2 > - palignr $7, %xmm4, %xmm3 > - palignr $7, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_7_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_7_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - 
BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_8): > - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x08(%rsi), %xmm1 > - jb L(L8_fwd) > - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9 > -L(L8_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > -L(shl_8_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_8_loop_L1): > - sub $64, %rdx > - movaps 0x08(%rsi), %xmm2 > - movaps 0x18(%rsi), %xmm3 > - movaps 0x28(%rsi), %xmm4 > - movaps 0x38(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $8, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $8, %xmm3, %xmm4 > - palignr $8, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $8, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_8_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > - .p2align 4 > -L(shl_8_end): > - lea 64(%rdx), %rdx > - movaps %xmm4, -0x20(%rdi) > - add %rdx, %rsi > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_8_bwd): > - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x08(%rsi), %xmm1 > - jb L(L8_bwd) > - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9 > -L(L8_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_8_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_8_bwd_loop_L1): > - movaps -0x18(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x28(%rsi), %xmm3 > - movaps -0x38(%rsi), %xmm4 > - movaps -0x48(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $8, %xmm2, %xmm1 > - palignr $8, %xmm3, %xmm2 > - palignr $8, %xmm4, %xmm3 > - palignr $8, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_8_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_8_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_9): > - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x09(%rsi), %xmm1 > - jb L(L9_fwd) > - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9 > -L(L9_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_9_loop_L1): > - sub $64, %rdx > - movaps 0x07(%rsi), %xmm2 > - movaps 0x17(%rsi), %xmm3 > - movaps 0x27(%rsi), %xmm4 > - movaps 0x37(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $9, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $9, %xmm3, %xmm4 > - palignr $9, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $9, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_9_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_9_bwd): > - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x09(%rsi), %xmm1 > - jb L(L9_bwd) > - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9 > -L(L9_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > 
-L(shl_9_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_9_bwd_loop_L1): > - movaps -0x19(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x29(%rsi), %xmm3 > - movaps -0x39(%rsi), %xmm4 > - movaps -0x49(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $9, %xmm2, %xmm1 > - palignr $9, %xmm3, %xmm2 > - palignr $9, %xmm4, %xmm3 > - palignr $9, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_9_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_9_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_10): > - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0a(%rsi), %xmm1 > - jb L(L10_fwd) > - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9 > -L(L10_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_10_loop_L1): > - sub $64, %rdx > - movaps 0x06(%rsi), %xmm2 > - movaps 0x16(%rsi), %xmm3 > - movaps 0x26(%rsi), %xmm4 > - movaps 0x36(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $10, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $10, %xmm3, %xmm4 > - palignr $10, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $10, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_10_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_10_bwd): > - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0a(%rsi), %xmm1 > - jb L(L10_bwd) > - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9 > -L(L10_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_10_bwd_loop_L1): > - movaps -0x1a(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2a(%rsi), %xmm3 > - movaps -0x3a(%rsi), %xmm4 > - movaps -0x4a(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $10, %xmm2, %xmm1 > - palignr $10, %xmm3, %xmm2 > - palignr $10, %xmm4, %xmm3 > - palignr $10, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_10_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_10_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_11): > - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0b(%rsi), %xmm1 > - jb L(L11_fwd) > - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9 > -L(L11_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_11_loop_L1): > - sub $64, %rdx > - movaps 0x05(%rsi), %xmm2 > - movaps 0x15(%rsi), %xmm3 > - movaps 0x25(%rsi), %xmm4 > - movaps 0x35(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $11, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $11, %xmm3, %xmm4 > - palignr $11, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $11, 
%xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_11_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_11_bwd): > - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0b(%rsi), %xmm1 > - jb L(L11_bwd) > - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9 > -L(L11_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_11_bwd_loop_L1): > - movaps -0x1b(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2b(%rsi), %xmm3 > - movaps -0x3b(%rsi), %xmm4 > - movaps -0x4b(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $11, %xmm2, %xmm1 > - palignr $11, %xmm3, %xmm2 > - palignr $11, %xmm4, %xmm3 > - palignr $11, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_11_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_11_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_12): > - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0c(%rsi), %xmm1 > - jb L(L12_fwd) > - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9 > -L(L12_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_12_loop_L1): > - sub $64, %rdx > - movaps 0x04(%rsi), %xmm2 > - movaps 0x14(%rsi), %xmm3 > - movaps 0x24(%rsi), %xmm4 > - movaps 0x34(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $12, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $12, %xmm3, %xmm4 > - palignr $12, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $12, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_12_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_12_bwd): > - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0c(%rsi), %xmm1 > - jb L(L12_bwd) > - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9 > -L(L12_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_12_bwd_loop_L1): > - movaps -0x1c(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2c(%rsi), %xmm3 > - movaps -0x3c(%rsi), %xmm4 > - movaps -0x4c(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $12, %xmm2, %xmm1 > - palignr $12, %xmm3, %xmm2 > - palignr $12, %xmm4, %xmm3 > - palignr $12, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_12_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_12_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - 
movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_13): > - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0d(%rsi), %xmm1 > - jb L(L13_fwd) > - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9 > -L(L13_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_13_loop_L1): > - sub $64, %rdx > - movaps 0x03(%rsi), %xmm2 > - movaps 0x13(%rsi), %xmm3 > - movaps 0x23(%rsi), %xmm4 > - movaps 0x33(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $13, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $13, %xmm3, %xmm4 > - palignr $13, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $13, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_13_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_13_bwd): > - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0d(%rsi), %xmm1 > - jb L(L13_bwd) > - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9 > -L(L13_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_13_bwd_loop_L1): > - movaps -0x1d(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2d(%rsi), %xmm3 > - movaps -0x3d(%rsi), %xmm4 > - movaps -0x4d(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $13, %xmm2, %xmm1 > - palignr $13, %xmm3, %xmm2 > - palignr $13, %xmm4, %xmm3 > - palignr $13, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_13_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_13_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_14): > - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0e(%rsi), %xmm1 > - jb L(L14_fwd) > - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9 > -L(L14_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_14_loop_L1): > - sub $64, %rdx > - movaps 0x02(%rsi), %xmm2 > - movaps 0x12(%rsi), %xmm3 > - movaps 0x22(%rsi), %xmm4 > - movaps 0x32(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $14, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $14, %xmm3, %xmm4 > - palignr $14, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $14, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_14_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_14_bwd): > - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0e(%rsi), %xmm1 > - jb L(L14_bwd) > - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9 > 
-L(L14_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_14_bwd_loop_L1): > - movaps -0x1e(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2e(%rsi), %xmm3 > - movaps -0x3e(%rsi), %xmm4 > - movaps -0x4e(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $14, %xmm2, %xmm1 > - palignr $14, %xmm3, %xmm2 > - palignr $14, %xmm4, %xmm3 > - palignr $14, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_14_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_14_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_15): > - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0f(%rsi), %xmm1 > - jb L(L15_fwd) > - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9 > -L(L15_fwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_loop_L2): > - prefetchnta 0x1c0(%rsi) > -L(shl_15_loop_L1): > - sub $64, %rdx > - movaps 0x01(%rsi), %xmm2 > - movaps 0x11(%rsi), %xmm3 > - movaps 0x21(%rsi), %xmm4 > - movaps 0x31(%rsi), %xmm5 > - movdqa %xmm5, %xmm6 > - palignr $15, %xmm4, %xmm5 > - lea 64(%rsi), %rsi > - palignr $15, %xmm3, %xmm4 > - palignr $15, %xmm2, %xmm3 > - lea 64(%rdi), %rdi > - palignr $15, %xmm1, %xmm2 > - movdqa %xmm6, %xmm1 > - movdqa %xmm2, -0x40(%rdi) > - movaps %xmm3, -0x30(%rdi) > - jb L(shl_15_end) > - movaps %xmm4, -0x20(%rdi) > - movaps %xmm5, -0x10(%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_end): > - movaps %xmm4, -0x20(%rdi) > - lea 64(%rdx), %rdx > - movaps %xmm5, -0x10(%rdi) > - add %rdx, %rdi > - movdqu %xmm0, (%r8) > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(shl_15_bwd): > - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9 > - cmp %rcx, %rdx > - movaps -0x0f(%rsi), %xmm1 > - jb L(L15_bwd) > - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9 > -L(L15_bwd): > - lea -64(%rdx), %rdx > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_bwd_loop_L2): > - prefetchnta -0x1c0(%rsi) > -L(shl_15_bwd_loop_L1): > - movaps -0x1f(%rsi), %xmm2 > - sub $0x40, %rdx > - movaps -0x2f(%rsi), %xmm3 > - movaps -0x3f(%rsi), %xmm4 > - movaps -0x4f(%rsi), %xmm5 > - lea -0x40(%rsi), %rsi > - palignr $15, %xmm2, %xmm1 > - palignr $15, %xmm3, %xmm2 > - palignr $15, %xmm4, %xmm3 > - palignr $15, %xmm5, %xmm4 > - > - movaps %xmm1, -0x10(%rdi) > - movaps %xmm5, %xmm1 > - > - movaps %xmm2, -0x20(%rdi) > - lea -0x40(%rdi), %rdi > - > - movaps %xmm3, 0x10(%rdi) > - jb L(shl_15_bwd_end) > - movaps %xmm4, (%rdi) > - _CET_NOTRACK jmp *%r9 > - ud2 > -L(shl_15_bwd_end): > - movaps %xmm4, (%rdi) > - lea 64(%rdx), %rdx > - movdqu %xmm0, (%r8) > - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4) > - > - .p2align 4 > -L(write_72bytes): > - movdqu -72(%rsi), %xmm0 > - movdqu -56(%rsi), %xmm1 > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rcx > - movdqu %xmm0, -72(%rdi) > - movdqu %xmm1, -56(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_64bytes): > - movdqu -64(%rsi), %xmm0 > - mov -48(%rsi), %rcx > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > 
- mov -8(%rsi), %rdx > - movdqu %xmm0, -64(%rdi) > - mov %rcx, -48(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_56bytes): > - movdqu -56(%rsi), %xmm0 > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rcx > - movdqu %xmm0, -56(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_48bytes): > - mov -48(%rsi), %rcx > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %rcx, -48(%rdi) > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_40bytes): > - mov -40(%rsi), %r8 > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r8, -40(%rdi) > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_32bytes): > - mov -32(%rsi), %r9 > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r9, -32(%rdi) > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_24bytes): > - mov -24(%rsi), %r10 > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r10, -24(%rdi) > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_16bytes): > - mov -16(%rsi), %r11 > - mov -8(%rsi), %rdx > - mov %r11, -16(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_8bytes): > - mov -8(%rsi), %rdx > - mov %rdx, -8(%rdi) > -L(write_0bytes): > - ret > - > - .p2align 4 > -L(write_73bytes): > - movdqu -73(%rsi), %xmm0 > - movdqu -57(%rsi), %xmm1 > - mov -41(%rsi), %rcx > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %r8 > - mov -4(%rsi), %edx > - movdqu %xmm0, -73(%rdi) > - movdqu %xmm1, -57(%rdi) > - mov %rcx, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %r8, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_65bytes): > - movdqu -65(%rsi), %xmm0 > - movdqu -49(%rsi), %xmm1 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -65(%rdi) > - movdqu %xmm1, -49(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_57bytes): > - movdqu -57(%rsi), %xmm0 > - mov -41(%rsi), %r8 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -57(%rdi) > - mov %r8, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_49bytes): > - movdqu -49(%rsi), %xmm0 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -49(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_41bytes): > - mov -41(%rsi), %r8 > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov 
-1(%rsi), %dl > - mov %r8, -41(%rdi) > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_33bytes): > - mov -33(%rsi), %r9 > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -1(%rsi), %dl > - mov %r9, -33(%rdi) > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_25bytes): > - mov -25(%rsi), %r10 > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -1(%rsi), %dl > - mov %r10, -25(%rdi) > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_17bytes): > - mov -17(%rsi), %r11 > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -17(%rdi) > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_9bytes): > - mov -9(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -9(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_1bytes): > - mov -1(%rsi), %dl > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(write_74bytes): > - movdqu -74(%rsi), %xmm0 > - movdqu -58(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -74(%rdi) > - movdqu %xmm1, -58(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_66bytes): > - movdqu -66(%rsi), %xmm0 > - movdqu -50(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -66(%rdi) > - movdqu %xmm1, -50(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_58bytes): > - movdqu -58(%rsi), %xmm1 > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm1, -58(%rdi) > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_50bytes): > - movdqu -50(%rsi), %xmm0 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -50(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_42bytes): > - mov -42(%rsi), %r8 > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -42(%rdi) > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_34bytes): > - mov -34(%rsi), %r9 > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -34(%rdi) > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_26bytes): > - mov -26(%rsi), %r10 > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -26(%rdi) > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret 
> - > - .p2align 4 > -L(write_18bytes): > - mov -18(%rsi), %r11 > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -18(%rdi) > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_10bytes): > - mov -10(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -10(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_2bytes): > - mov -2(%rsi), %dx > - mov %dx, -2(%rdi) > - ret > - > - .p2align 4 > -L(write_75bytes): > - movdqu -75(%rsi), %xmm0 > - movdqu -59(%rsi), %xmm1 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -75(%rdi) > - movdqu %xmm1, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_67bytes): > - movdqu -67(%rsi), %xmm0 > - movdqu -59(%rsi), %xmm1 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -67(%rdi) > - movdqu %xmm1, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_59bytes): > - movdqu -59(%rsi), %xmm0 > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -59(%rdi) > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_51bytes): > - movdqu -51(%rsi), %xmm0 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -51(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_43bytes): > - mov -43(%rsi), %r8 > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -43(%rdi) > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_35bytes): > - mov -35(%rsi), %r9 > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -35(%rdi) > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_27bytes): > - mov -27(%rsi), %r10 > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -27(%rdi) > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_19bytes): > - mov -19(%rsi), %r11 > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -19(%rdi) > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_11bytes): > - mov -11(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -11(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_3bytes): > - mov -3(%rsi), %dx > - mov -2(%rsi), %cx > - mov %dx, -3(%rdi) > - mov %cx, -2(%rdi) > - ret > - > - .p2align 4 > -L(write_76bytes): > - movdqu -76(%rsi), %xmm0 > - movdqu -60(%rsi), %xmm1 > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov 
-12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -76(%rdi) > - movdqu %xmm1, -60(%rdi) > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_68bytes): > - movdqu -68(%rsi), %xmm0 > - movdqu -52(%rsi), %xmm1 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -68(%rdi) > - movdqu %xmm1, -52(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_60bytes): > - movdqu -60(%rsi), %xmm0 > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -60(%rdi) > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_52bytes): > - movdqu -52(%rsi), %xmm0 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - movdqu %xmm0, -52(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_44bytes): > - mov -44(%rsi), %r8 > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r8, -44(%rdi) > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_36bytes): > - mov -36(%rsi), %r9 > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r9, -36(%rdi) > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_28bytes): > - mov -28(%rsi), %r10 > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r10, -28(%rdi) > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_20bytes): > - mov -20(%rsi), %r11 > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %r11, -20(%rdi) > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_12bytes): > - mov -12(%rsi), %rcx > - mov -4(%rsi), %edx > - mov %rcx, -12(%rdi) > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_4bytes): > - mov -4(%rsi), %edx > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_77bytes): > - movdqu -77(%rsi), %xmm0 > - movdqu -61(%rsi), %xmm1 > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -77(%rdi) > - movdqu %xmm1, -61(%rdi) > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_69bytes): > - movdqu -69(%rsi), %xmm0 > - movdqu -53(%rsi), %xmm1 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -69(%rdi) > - movdqu %xmm1, -53(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_61bytes): > - movdqu -61(%rsi), %xmm0 > - 
mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -61(%rdi) > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_53bytes): > - movdqu -53(%rsi), %xmm0 > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -53(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_45bytes): > - mov -45(%rsi), %r8 > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -45(%rdi) > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_37bytes): > - mov -37(%rsi), %r9 > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -37(%rdi) > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_29bytes): > - mov -29(%rsi), %r10 > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -29(%rdi) > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_21bytes): > - mov -21(%rsi), %r11 > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -21(%rdi) > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_13bytes): > - mov -13(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -13(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_5bytes): > - mov -5(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -5(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_78bytes): > - movdqu -78(%rsi), %xmm0 > - movdqu -62(%rsi), %xmm1 > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -78(%rdi) > - movdqu %xmm1, -62(%rdi) > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_70bytes): > - movdqu -70(%rsi), %xmm0 > - movdqu -54(%rsi), %xmm1 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -70(%rdi) > - movdqu %xmm1, -54(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_62bytes): > - movdqu -62(%rsi), %xmm0 > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -62(%rdi) > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_54bytes): > - movdqu -54(%rsi), %xmm0 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -54(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, 
-8(%rdi) > - ret > - > - .p2align 4 > -L(write_46bytes): > - mov -46(%rsi), %r8 > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -46(%rdi) > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_38bytes): > - mov -38(%rsi), %r9 > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -38(%rdi) > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_30bytes): > - mov -30(%rsi), %r10 > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -30(%rdi) > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_22bytes): > - mov -22(%rsi), %r11 > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -22(%rdi) > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_14bytes): > - mov -14(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -14(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_6bytes): > - mov -6(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -6(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(write_79bytes): > - movdqu -79(%rsi), %xmm0 > - movdqu -63(%rsi), %xmm1 > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -79(%rdi) > - movdqu %xmm1, -63(%rdi) > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_71bytes): > - movdqu -71(%rsi), %xmm0 > - movdqu -55(%rsi), %xmm1 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -71(%rdi) > - movdqu %xmm1, -55(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_63bytes): > - movdqu -63(%rsi), %xmm0 > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -63(%rdi) > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_55bytes): > - movdqu -55(%rsi), %xmm0 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - movdqu %xmm0, -55(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_47bytes): > - mov -47(%rsi), %r8 > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r8, -47(%rdi) > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_39bytes): > - mov -39(%rsi), %r9 > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r9, -39(%rdi) > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > 
-L(write_31bytes): > - mov -31(%rsi), %r10 > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r10, -31(%rdi) > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_23bytes): > - mov -23(%rsi), %r11 > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %r11, -23(%rdi) > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_15bytes): > - mov -15(%rsi), %rcx > - mov -8(%rsi), %rdx > - mov %rcx, -15(%rdi) > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(write_7bytes): > - mov -7(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -7(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(large_page_fwd): > - movdqu (%rsi), %xmm1 > - lea 16(%rsi), %rsi > - movdqu %xmm0, (%r8) > - movntdq %xmm1, (%rdi) > - lea 16(%rdi), %rdi > - lea -0x90(%rdx), %rdx > -#ifdef USE_AS_MEMMOVE > - mov %rsi, %r9 > - sub %rdi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_fwd) > - shl $2, %rcx > - cmp %rcx, %rdx > - jb L(ll_cache_copy_fwd_start) > -L(memmove_is_memcpy_fwd): > -#endif > -L(large_page_loop): > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - movntdq %xmm4, 0x40(%rdi) > - movntdq %xmm5, 0x50(%rdi) > - movntdq %xmm6, 0x60(%rdi) > - movntdq %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - jae L(large_page_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_less_64bytes) > - > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - lea 0x40(%rsi), %rsi > - > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - lea 0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_less_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - sfence > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#ifdef USE_AS_MEMMOVE > - .p2align 4 > -L(ll_cache_copy_fwd_start): > - prefetcht0 0x1c0(%rsi) > - prefetcht0 0x200(%rsi) > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lea 0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movaps %xmm0, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - movaps %xmm4, 0x40(%rdi) > - movaps %xmm5, 0x50(%rdi) > - movaps %xmm6, 0x60(%rdi) > - movaps %xmm7, 0x70(%rdi) > - lea 0x80(%rdi), %rdi > - jae L(ll_cache_copy_fwd_start) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_ll_less_fwd_64bytes) > - > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - lea 0x40(%rsi), %rsi > - > - movaps %xmm0, (%rdi) > - movaps %xmm1, 0x10(%rdi) > - movaps %xmm2, 0x20(%rdi) > - movaps %xmm3, 0x30(%rdi) > - lea 0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_ll_less_fwd_64bytes): > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#endif > - .p2align 4 > -L(large_page_bwd): > - movdqu -0x10(%rsi), %xmm1 > - lea -16(%rsi), %rsi > - movdqu %xmm0, 
(%r8) > - movdqa %xmm1, -0x10(%rdi) > - lea -16(%rdi), %rdi > - lea -0x90(%rdx), %rdx > -#ifdef USE_AS_MEMMOVE > - mov %rdi, %r9 > - sub %rsi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_bwd) > - cmp %rcx, %r9 > - jb L(ll_cache_copy_bwd_start) > -L(memmove_is_memcpy_bwd): > -#endif > -L(large_page_bwd_loop): > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - movdqu -0x50(%rsi), %xmm4 > - movdqu -0x60(%rsi), %xmm5 > - movdqu -0x70(%rsi), %xmm6 > - movdqu -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movntdq %xmm0, -0x10(%rdi) > - movntdq %xmm1, -0x20(%rdi) > - movntdq %xmm2, -0x30(%rdi) > - movntdq %xmm3, -0x40(%rdi) > - movntdq %xmm4, -0x50(%rdi) > - movntdq %xmm5, -0x60(%rdi) > - movntdq %xmm6, -0x70(%rdi) > - movntdq %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - jae L(large_page_bwd_loop) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_less_bwd_64bytes) > - > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - lea -0x40(%rsi), %rsi > - > - movntdq %xmm0, -0x10(%rdi) > - movntdq %xmm1, -0x20(%rdi) > - movntdq %xmm2, -0x30(%rdi) > - movntdq %xmm3, -0x40(%rdi) > - lea -0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_less_bwd_64bytes): > - sfence > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > - > -#ifdef USE_AS_MEMMOVE > - .p2align 4 > -L(ll_cache_copy_bwd_start): > - prefetcht0 -0x1c0(%rsi) > - prefetcht0 -0x200(%rsi) > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - movdqu -0x50(%rsi), %xmm4 > - movdqu -0x60(%rsi), %xmm5 > - movdqu -0x70(%rsi), %xmm6 > - movdqu -0x80(%rsi), %xmm7 > - lea -0x80(%rsi), %rsi > - > - sub $0x80, %rdx > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - movaps %xmm4, -0x50(%rdi) > - movaps %xmm5, -0x60(%rdi) > - movaps %xmm6, -0x70(%rdi) > - movaps %xmm7, -0x80(%rdi) > - lea -0x80(%rdi), %rdi > - jae L(ll_cache_copy_bwd_start) > - cmp $-0x40, %rdx > - lea 0x80(%rdx), %rdx > - jl L(large_page_ll_less_bwd_64bytes) > - > - movdqu -0x10(%rsi), %xmm0 > - movdqu -0x20(%rsi), %xmm1 > - movdqu -0x30(%rsi), %xmm2 > - movdqu -0x40(%rsi), %xmm3 > - lea -0x40(%rsi), %rsi > - > - movaps %xmm0, -0x10(%rdi) > - movaps %xmm1, -0x20(%rdi) > - movaps %xmm2, -0x30(%rdi) > - movaps %xmm3, -0x40(%rdi) > - lea -0x40(%rdi), %rdi > - sub $0x40, %rdx > -L(large_page_ll_less_bwd_64bytes): > - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) > -#endif > - > -END (MEMCPY) > - > - .section .rodata.ssse3,"a",@progbits > - .p2align 3 > -L(table_less_80bytes): > - .int JMPTBL (L(write_0bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_1bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_2bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_3bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_4bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_5bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_6bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_7bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_8bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_9bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_10bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_11bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_12bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_13bytes), 
L(table_less_80bytes)) > - .int JMPTBL (L(write_14bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_15bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_16bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_17bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_18bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_19bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_20bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_21bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_22bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_23bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_24bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_25bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_26bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_27bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_28bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_29bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_30bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_31bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_32bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_33bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_34bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_35bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_36bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_37bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_38bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_39bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_40bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_41bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_42bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_43bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_44bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_45bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_46bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_47bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_48bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_49bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_50bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_51bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_52bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_53bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_54bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_55bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_56bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_57bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_58bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_59bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_60bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_61bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_62bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_63bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_64bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_65bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_66bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_67bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_68bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_69bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_70bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_71bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_72bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_73bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_74bytes), 
L(table_less_80bytes)) > - .int JMPTBL (L(write_75bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_76bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_77bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_78bytes), L(table_less_80bytes)) > - .int JMPTBL (L(write_79bytes), L(table_less_80bytes)) > - > - .p2align 3 > -L(shl_table): > - .int JMPTBL (L(shl_0), L(shl_table)) > - .int JMPTBL (L(shl_1), L(shl_table)) > - .int JMPTBL (L(shl_2), L(shl_table)) > - .int JMPTBL (L(shl_3), L(shl_table)) > - .int JMPTBL (L(shl_4), L(shl_table)) > - .int JMPTBL (L(shl_5), L(shl_table)) > - .int JMPTBL (L(shl_6), L(shl_table)) > - .int JMPTBL (L(shl_7), L(shl_table)) > - .int JMPTBL (L(shl_8), L(shl_table)) > - .int JMPTBL (L(shl_9), L(shl_table)) > - .int JMPTBL (L(shl_10), L(shl_table)) > - .int JMPTBL (L(shl_11), L(shl_table)) > - .int JMPTBL (L(shl_12), L(shl_table)) > - .int JMPTBL (L(shl_13), L(shl_table)) > - .int JMPTBL (L(shl_14), L(shl_table)) > - .int JMPTBL (L(shl_15), L(shl_table)) > - > - .p2align 3 > -L(shl_table_bwd): > - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) > - > -#endif > diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S > deleted file mode 100644 > index 295430b1ef..0000000000 > --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_MEMMOVE > -#define MEMCPY __memmove_ssse3 > -#define MEMCPY_CHK __memmove_chk_ssse3 > -#include "memcpy-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein 2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein 2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein @ 2022-03-25 18:36 ` Noah Goldstein 2022-03-25 19:56 ` H.J. Lu 2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein ` (3 subsequent siblings) 6 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer SSSE3. As a result it's no longer worth the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 - sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - 5 files changed, 3209 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 48f81711ae..323be3b969 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -16,14 +16,12 @@ sysdep_routines += \ memcmpeq-avx2-rtm \ memcmpeq-evex \ memcmpeq-sse2 \ - memcpy-ssse3-back \ memmove-avx-unaligned-erms \ memmove-avx-unaligned-erms-rtm \ memmove-avx512-no-vzeroupper \ memmove-avx512-unaligned-erms \ memmove-evex-unaligned-erms \ memmove-sse2-unaligned-erms \ - memmove-ssse3-back \ memrchr-avx2 \ memrchr-avx2-rtm \ memrchr-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 70b0e9c62e..d6852ab365 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memmove_chk, CPU_FEATURE_USABLE (AVX512VL), __memmove_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memmove_chk, - CPU_FEATURE_USABLE (SSSE3), - __memmove_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, __memmove_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, @@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (AVX512VL), __memmove_avx512_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), - __memmove_ssse3_back) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_sse2_unaligned) @@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __memcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __memcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __memcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __memcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, __memcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, @@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512VL), __memcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), - __memcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (AVX512F), __memcpy_avx512_no_vzeroupper) @@ -960,9 +950,6
@@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, __mempcpy_chk, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_chk_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, - CPU_FEATURE_USABLE (SSSE3), - __mempcpy_chk_ssse3_back) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, __mempcpy_chk_sse2_unaligned) IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, @@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (AVX512VL), __mempcpy_evex_unaligned_erms) - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), - __mempcpy_ssse3_back) IFUNC_IMPL_ADD (array, i, mempcpy, 1, __mempcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, mempcpy, 1, diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h index 1ecdd4b0d3..5596ddea2c 100644 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h @@ -92,13 +92,6 @@ IFUNC_SELECTOR (void) } } - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) - && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy)) - { - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) - return OPTIMIZE (ssse3_back); - } - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) return OPTIMIZE (sse2_unaligned_erms); diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S deleted file mode 100644 index 92cfbf7933..0000000000 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +++ /dev/null @@ -1,3181 +0,0 @@ -/* memcpy with SSSE3 and REP string - Copyright (C) 2010-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#include <sysdep.h> - -#if IS_IN (libc) - -#include "asm-syntax.h" - -#ifndef MEMCPY -# define MEMCPY __memcpy_ssse3_back -# define MEMCPY_CHK __memcpy_chk_ssse3_back -# define MEMPCPY __mempcpy_ssse3_back -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back -#endif - -#define JMPTBL(I, B) I - B - -/* Branch to an entry in a jump table. TABLE is a jump table with - relative offsets. INDEX is a register containing the index into the - jump table. SCALE is the scale of INDEX. 
*/ -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ - lea TABLE(%rip), %r11; \ - movslq (%r11, INDEX, SCALE), INDEX; \ - lea (%r11, INDEX), INDEX; \ - _CET_NOTRACK jmp *INDEX; \ - ud2 - - .section .text.ssse3,"ax",@progbits -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE -ENTRY (MEMPCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMPCPY_CHK) - -ENTRY (MEMPCPY) - mov %RDI_LP, %RAX_LP - add %RDX_LP, %RAX_LP - jmp L(start) -END (MEMPCPY) -#endif - -#if !defined USE_AS_BCOPY -ENTRY (MEMCPY_CHK) - cmp %RDX_LP, %RCX_LP - jb HIDDEN_JUMPTARGET (__chk_fail) -END (MEMCPY_CHK) -#endif - -ENTRY (MEMCPY) - mov %RDI_LP, %RAX_LP -#ifdef USE_AS_MEMPCPY - add %RDX_LP, %RAX_LP -#endif - -#ifdef __ILP32__ - /* Clear the upper 32 bits. */ - mov %edx, %edx -#endif - -#ifdef USE_AS_MEMMOVE - cmp %rsi, %rdi - jb L(copy_forward) - je L(bwd_write_0bytes) - cmp $144, %rdx - jae L(copy_backward) - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -L(copy_forward): -#endif -L(start): - cmp $144, %rdx - jae L(144bytesormore) - -L(fwd_write_less32bytes): -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jbe L(bk_write) -#endif - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) -#ifndef USE_AS_MEMMOVE -L(bk_write): - - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) -#endif - - .p2align 4 -L(144bytesormore): - -#ifndef USE_AS_MEMMOVE - cmp %dil, %sil - jle L(copy_backward) -#endif - movdqu (%rsi), %xmm0 - mov %rdi, %r8 - and $-16, %rdi - add $16, %rdi - mov %rdi, %r9 - sub %r8, %r9 - sub %r9, %rdx - add %r9, %rsi - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0) -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - cmp %rcx, %rdx - jae L(gobble_mem_fwd) - lea L(shl_table_fwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(copy_backward): -#ifdef DATA_CACHE_SIZE - mov $DATA_CACHE_SIZE, %RCX_LP -#else - mov __x86_data_cache_size(%rip), %RCX_LP -#endif - shl $1, %rcx - cmp %rcx, %rdx - ja L(gobble_mem_bwd) - - add %rdx, %rdi - add %rdx, %rsi - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $0xf, %r9 - xor %r9, %rdi - sub %r9, %rsi - sub %r9, %rdx - mov %rsi, %r9 - and $0xf, %r9 - jz L(shl_0_bwd) - lea L(shl_table_bwd)(%rip), %r11 - sub $0x80, %rdx - movslq (%r11, %r9, 4), %r9 - add %r11, %r9 - _CET_NOTRACK jmp *%r9 - ud2 - - .p2align 4 -L(shl_0): - - mov %rdx, %r9 - shr $8, %r9 - add %rdx, %r9 -#ifdef DATA_CACHE_SIZE - cmp $DATA_CACHE_SIZE_HALF, %R9_LP -#else - cmp __x86_data_cache_size_half(%rip), %R9_LP -#endif - jae L(gobble_mem_fwd) - sub $0x80, %rdx - .p2align 4 -L(shl_0_loop): - movdqa (%rsi), %xmm1 - movdqa %xmm1, (%rdi) - movaps 0x10(%rsi), %xmm2 - movaps %xmm2, 0x10(%rdi) - movaps 0x20(%rsi), %xmm3 - movaps %xmm3, 0x20(%rdi) - movaps 0x30(%rsi), %xmm4 - movaps %xmm4, 0x30(%rdi) - movaps 0x40(%rsi), %xmm1 - movaps %xmm1, 0x40(%rdi) - movaps 0x50(%rsi), %xmm2 - movaps %xmm2, 0x50(%rdi) - movaps 0x60(%rsi), %xmm3 - movaps %xmm3, 0x60(%rdi) - movaps 0x70(%rsi), %xmm4 - movaps %xmm4, 0x70(%rdi) - sub $0x80, %rdx - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(shl_0_loop) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_0_bwd): - sub $0x80, %rdx -L(copy_backward_loop): - movaps -0x10(%rsi), %xmm1 - movaps %xmm1, -0x10(%rdi) - movaps -0x20(%rsi), %xmm2 - movaps %xmm2, 
-0x20(%rdi) - movaps -0x30(%rsi), %xmm3 - movaps %xmm3, -0x30(%rdi) - movaps -0x40(%rsi), %xmm4 - movaps %xmm4, -0x40(%rdi) - movaps -0x50(%rsi), %xmm5 - movaps %xmm5, -0x50(%rdi) - movaps -0x60(%rsi), %xmm5 - movaps %xmm5, -0x60(%rdi) - movaps -0x70(%rsi), %xmm5 - movaps %xmm5, -0x70(%rdi) - movaps -0x80(%rsi), %xmm5 - movaps %xmm5, -0x80(%rdi) - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(copy_backward_loop) - - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_1): - sub $0x80, %rdx - movaps -0x01(%rsi), %xmm1 - movaps 0x0f(%rsi), %xmm2 - movaps 0x1f(%rsi), %xmm3 - movaps 0x2f(%rsi), %xmm4 - movaps 0x3f(%rsi), %xmm5 - movaps 0x4f(%rsi), %xmm6 - movaps 0x5f(%rsi), %xmm7 - movaps 0x6f(%rsi), %xmm8 - movaps 0x7f(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $1, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $1, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $1, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $1, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $1, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $1, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $1, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_1) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_1_bwd): - movaps -0x01(%rsi), %xmm1 - - movaps -0x11(%rsi), %xmm2 - palignr $1, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x21(%rsi), %xmm3 - palignr $1, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x31(%rsi), %xmm4 - palignr $1, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x41(%rsi), %xmm5 - palignr $1, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x51(%rsi), %xmm6 - palignr $1, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x61(%rsi), %xmm7 - palignr $1, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x71(%rsi), %xmm8 - palignr $1, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x81(%rsi), %xmm9 - palignr $1, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_1_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_2): - sub $0x80, %rdx - movaps -0x02(%rsi), %xmm1 - movaps 0x0e(%rsi), %xmm2 - movaps 0x1e(%rsi), %xmm3 - movaps 0x2e(%rsi), %xmm4 - movaps 0x3e(%rsi), %xmm5 - movaps 0x4e(%rsi), %xmm6 - movaps 0x5e(%rsi), %xmm7 - movaps 0x6e(%rsi), %xmm8 - movaps 0x7e(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $2, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $2, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $2, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $2, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $2, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $2, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $2, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_2) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_2_bwd): - movaps -0x02(%rsi), %xmm1 - - movaps -0x12(%rsi), %xmm2 - palignr $2, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x22(%rsi), %xmm3 - palignr $2, %xmm3, %xmm2 
- movaps %xmm2, -0x20(%rdi) - - movaps -0x32(%rsi), %xmm4 - palignr $2, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x42(%rsi), %xmm5 - palignr $2, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x52(%rsi), %xmm6 - palignr $2, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x62(%rsi), %xmm7 - palignr $2, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x72(%rsi), %xmm8 - palignr $2, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x82(%rsi), %xmm9 - palignr $2, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_2_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_3): - sub $0x80, %rdx - movaps -0x03(%rsi), %xmm1 - movaps 0x0d(%rsi), %xmm2 - movaps 0x1d(%rsi), %xmm3 - movaps 0x2d(%rsi), %xmm4 - movaps 0x3d(%rsi), %xmm5 - movaps 0x4d(%rsi), %xmm6 - movaps 0x5d(%rsi), %xmm7 - movaps 0x6d(%rsi), %xmm8 - movaps 0x7d(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $3, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $3, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $3, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $3, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $3, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $3, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $3, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_3) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_3_bwd): - movaps -0x03(%rsi), %xmm1 - - movaps -0x13(%rsi), %xmm2 - palignr $3, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x23(%rsi), %xmm3 - palignr $3, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x33(%rsi), %xmm4 - palignr $3, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x43(%rsi), %xmm5 - palignr $3, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x53(%rsi), %xmm6 - palignr $3, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x63(%rsi), %xmm7 - palignr $3, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x73(%rsi), %xmm8 - palignr $3, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x83(%rsi), %xmm9 - palignr $3, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_3_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_4): - sub $0x80, %rdx - movaps -0x04(%rsi), %xmm1 - movaps 0x0c(%rsi), %xmm2 - movaps 0x1c(%rsi), %xmm3 - movaps 0x2c(%rsi), %xmm4 - movaps 0x3c(%rsi), %xmm5 - movaps 0x4c(%rsi), %xmm6 - movaps 0x5c(%rsi), %xmm7 - movaps 0x6c(%rsi), %xmm8 - movaps 0x7c(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $4, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $4, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $4, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $4, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $4, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $4, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $4, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_4) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 
-L(shl_4_bwd): - movaps -0x04(%rsi), %xmm1 - - movaps -0x14(%rsi), %xmm2 - palignr $4, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x24(%rsi), %xmm3 - palignr $4, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x34(%rsi), %xmm4 - palignr $4, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x44(%rsi), %xmm5 - palignr $4, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x54(%rsi), %xmm6 - palignr $4, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x64(%rsi), %xmm7 - palignr $4, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x74(%rsi), %xmm8 - palignr $4, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x84(%rsi), %xmm9 - palignr $4, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_4_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_5): - sub $0x80, %rdx - movaps -0x05(%rsi), %xmm1 - movaps 0x0b(%rsi), %xmm2 - movaps 0x1b(%rsi), %xmm3 - movaps 0x2b(%rsi), %xmm4 - movaps 0x3b(%rsi), %xmm5 - movaps 0x4b(%rsi), %xmm6 - movaps 0x5b(%rsi), %xmm7 - movaps 0x6b(%rsi), %xmm8 - movaps 0x7b(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $5, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $5, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $5, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $5, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $5, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $5, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $5, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_5) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_5_bwd): - movaps -0x05(%rsi), %xmm1 - - movaps -0x15(%rsi), %xmm2 - palignr $5, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x25(%rsi), %xmm3 - palignr $5, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x35(%rsi), %xmm4 - palignr $5, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x45(%rsi), %xmm5 - palignr $5, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x55(%rsi), %xmm6 - palignr $5, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x65(%rsi), %xmm7 - palignr $5, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x75(%rsi), %xmm8 - palignr $5, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x85(%rsi), %xmm9 - palignr $5, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_5_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_6): - sub $0x80, %rdx - movaps -0x06(%rsi), %xmm1 - movaps 0x0a(%rsi), %xmm2 - movaps 0x1a(%rsi), %xmm3 - movaps 0x2a(%rsi), %xmm4 - movaps 0x3a(%rsi), %xmm5 - movaps 0x4a(%rsi), %xmm6 - movaps 0x5a(%rsi), %xmm7 - movaps 0x6a(%rsi), %xmm8 - movaps 0x7a(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $6, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $6, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $6, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $6, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $6, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $6, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $6, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - 
lea 0x80(%rdi), %rdi - jae L(shl_6) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_6_bwd): - movaps -0x06(%rsi), %xmm1 - - movaps -0x16(%rsi), %xmm2 - palignr $6, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x26(%rsi), %xmm3 - palignr $6, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x36(%rsi), %xmm4 - palignr $6, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x46(%rsi), %xmm5 - palignr $6, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x56(%rsi), %xmm6 - palignr $6, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x66(%rsi), %xmm7 - palignr $6, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x76(%rsi), %xmm8 - palignr $6, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x86(%rsi), %xmm9 - palignr $6, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_6_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_7): - sub $0x80, %rdx - movaps -0x07(%rsi), %xmm1 - movaps 0x09(%rsi), %xmm2 - movaps 0x19(%rsi), %xmm3 - movaps 0x29(%rsi), %xmm4 - movaps 0x39(%rsi), %xmm5 - movaps 0x49(%rsi), %xmm6 - movaps 0x59(%rsi), %xmm7 - movaps 0x69(%rsi), %xmm8 - movaps 0x79(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $7, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $7, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $7, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $7, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $7, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $7, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $7, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_7) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_7_bwd): - movaps -0x07(%rsi), %xmm1 - - movaps -0x17(%rsi), %xmm2 - palignr $7, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x27(%rsi), %xmm3 - palignr $7, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x37(%rsi), %xmm4 - palignr $7, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x47(%rsi), %xmm5 - palignr $7, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x57(%rsi), %xmm6 - palignr $7, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x67(%rsi), %xmm7 - palignr $7, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x77(%rsi), %xmm8 - palignr $7, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x87(%rsi), %xmm9 - palignr $7, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_7_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_8): - sub $0x80, %rdx - movaps -0x08(%rsi), %xmm1 - movaps 0x08(%rsi), %xmm2 - movaps 0x18(%rsi), %xmm3 - movaps 0x28(%rsi), %xmm4 - movaps 0x38(%rsi), %xmm5 - movaps 0x48(%rsi), %xmm6 - movaps 0x58(%rsi), %xmm7 - movaps 0x68(%rsi), %xmm8 - movaps 0x78(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $8, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $8, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $8, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $8, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $8, %xmm4, %xmm5 - 
movaps %xmm5, 0x30(%rdi) - palignr $8, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $8, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_8) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_8_bwd): - movaps -0x08(%rsi), %xmm1 - - movaps -0x18(%rsi), %xmm2 - palignr $8, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x28(%rsi), %xmm3 - palignr $8, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x38(%rsi), %xmm4 - palignr $8, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x48(%rsi), %xmm5 - palignr $8, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x58(%rsi), %xmm6 - palignr $8, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x68(%rsi), %xmm7 - palignr $8, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x78(%rsi), %xmm8 - palignr $8, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x88(%rsi), %xmm9 - palignr $8, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_8_bwd) -L(shl_8_end_bwd): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_9): - sub $0x80, %rdx - movaps -0x09(%rsi), %xmm1 - movaps 0x07(%rsi), %xmm2 - movaps 0x17(%rsi), %xmm3 - movaps 0x27(%rsi), %xmm4 - movaps 0x37(%rsi), %xmm5 - movaps 0x47(%rsi), %xmm6 - movaps 0x57(%rsi), %xmm7 - movaps 0x67(%rsi), %xmm8 - movaps 0x77(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $9, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $9, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $9, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $9, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $9, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $9, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $9, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_9) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_9_bwd): - movaps -0x09(%rsi), %xmm1 - - movaps -0x19(%rsi), %xmm2 - palignr $9, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x29(%rsi), %xmm3 - palignr $9, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x39(%rsi), %xmm4 - palignr $9, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x49(%rsi), %xmm5 - palignr $9, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x59(%rsi), %xmm6 - palignr $9, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x69(%rsi), %xmm7 - palignr $9, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x79(%rsi), %xmm8 - palignr $9, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x89(%rsi), %xmm9 - palignr $9, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_9_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_10): - sub $0x80, %rdx - movaps -0x0a(%rsi), %xmm1 - movaps 0x06(%rsi), %xmm2 - movaps 0x16(%rsi), %xmm3 - movaps 0x26(%rsi), %xmm4 - movaps 0x36(%rsi), %xmm5 - movaps 0x46(%rsi), %xmm6 - movaps 0x56(%rsi), %xmm7 - movaps 0x66(%rsi), %xmm8 - movaps 0x76(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $10, %xmm8, %xmm9 - movaps %xmm9, 
0x70(%rdi) - palignr $10, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $10, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $10, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $10, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $10, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $10, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_10) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_10_bwd): - movaps -0x0a(%rsi), %xmm1 - - movaps -0x1a(%rsi), %xmm2 - palignr $10, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2a(%rsi), %xmm3 - palignr $10, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3a(%rsi), %xmm4 - palignr $10, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4a(%rsi), %xmm5 - palignr $10, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5a(%rsi), %xmm6 - palignr $10, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6a(%rsi), %xmm7 - palignr $10, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7a(%rsi), %xmm8 - palignr $10, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8a(%rsi), %xmm9 - palignr $10, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_10_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_11): - sub $0x80, %rdx - movaps -0x0b(%rsi), %xmm1 - movaps 0x05(%rsi), %xmm2 - movaps 0x15(%rsi), %xmm3 - movaps 0x25(%rsi), %xmm4 - movaps 0x35(%rsi), %xmm5 - movaps 0x45(%rsi), %xmm6 - movaps 0x55(%rsi), %xmm7 - movaps 0x65(%rsi), %xmm8 - movaps 0x75(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $11, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $11, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $11, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $11, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $11, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $11, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $11, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_11) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_11_bwd): - movaps -0x0b(%rsi), %xmm1 - - movaps -0x1b(%rsi), %xmm2 - palignr $11, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2b(%rsi), %xmm3 - palignr $11, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3b(%rsi), %xmm4 - palignr $11, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4b(%rsi), %xmm5 - palignr $11, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5b(%rsi), %xmm6 - palignr $11, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6b(%rsi), %xmm7 - palignr $11, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7b(%rsi), %xmm8 - palignr $11, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8b(%rsi), %xmm9 - palignr $11, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_11_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_12): - sub $0x80, %rdx - movdqa -0x0c(%rsi), %xmm1 - movaps 0x04(%rsi), %xmm2 - movaps 0x14(%rsi), %xmm3 - 
movaps 0x24(%rsi), %xmm4 - movaps 0x34(%rsi), %xmm5 - movaps 0x44(%rsi), %xmm6 - movaps 0x54(%rsi), %xmm7 - movaps 0x64(%rsi), %xmm8 - movaps 0x74(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $12, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $12, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $12, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $12, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $12, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $12, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $12, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - - lea 0x80(%rdi), %rdi - jae L(shl_12) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_12_bwd): - movaps -0x0c(%rsi), %xmm1 - - movaps -0x1c(%rsi), %xmm2 - palignr $12, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2c(%rsi), %xmm3 - palignr $12, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3c(%rsi), %xmm4 - palignr $12, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4c(%rsi), %xmm5 - palignr $12, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5c(%rsi), %xmm6 - palignr $12, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6c(%rsi), %xmm7 - palignr $12, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7c(%rsi), %xmm8 - palignr $12, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8c(%rsi), %xmm9 - palignr $12, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_12_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_13): - sub $0x80, %rdx - movaps -0x0d(%rsi), %xmm1 - movaps 0x03(%rsi), %xmm2 - movaps 0x13(%rsi), %xmm3 - movaps 0x23(%rsi), %xmm4 - movaps 0x33(%rsi), %xmm5 - movaps 0x43(%rsi), %xmm6 - movaps 0x53(%rsi), %xmm7 - movaps 0x63(%rsi), %xmm8 - movaps 0x73(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $13, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $13, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $13, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $13, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $13, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $13, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $13, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_13) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_13_bwd): - movaps -0x0d(%rsi), %xmm1 - - movaps -0x1d(%rsi), %xmm2 - palignr $13, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2d(%rsi), %xmm3 - palignr $13, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3d(%rsi), %xmm4 - palignr $13, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4d(%rsi), %xmm5 - palignr $13, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5d(%rsi), %xmm6 - palignr $13, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6d(%rsi), %xmm7 - palignr $13, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7d(%rsi), %xmm8 - palignr $13, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8d(%rsi), %xmm9 - palignr $13, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_13_bwd) - movdqu %xmm0, (%r8) - add $0x80, 
%rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_14): - sub $0x80, %rdx - movaps -0x0e(%rsi), %xmm1 - movaps 0x02(%rsi), %xmm2 - movaps 0x12(%rsi), %xmm3 - movaps 0x22(%rsi), %xmm4 - movaps 0x32(%rsi), %xmm5 - movaps 0x42(%rsi), %xmm6 - movaps 0x52(%rsi), %xmm7 - movaps 0x62(%rsi), %xmm8 - movaps 0x72(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $14, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $14, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $14, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $14, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $14, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $14, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $14, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_14) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_14_bwd): - movaps -0x0e(%rsi), %xmm1 - - movaps -0x1e(%rsi), %xmm2 - palignr $14, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2e(%rsi), %xmm3 - palignr $14, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3e(%rsi), %xmm4 - palignr $14, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4e(%rsi), %xmm5 - palignr $14, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5e(%rsi), %xmm6 - palignr $14, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6e(%rsi), %xmm7 - palignr $14, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7e(%rsi), %xmm8 - palignr $14, %xmm8, %xmm7 - movaps %xmm7, -0x70(%rdi) - - movaps -0x8e(%rsi), %xmm9 - palignr $14, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_14_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(shl_15): - sub $0x80, %rdx - movaps -0x0f(%rsi), %xmm1 - movaps 0x01(%rsi), %xmm2 - movaps 0x11(%rsi), %xmm3 - movaps 0x21(%rsi), %xmm4 - movaps 0x31(%rsi), %xmm5 - movaps 0x41(%rsi), %xmm6 - movaps 0x51(%rsi), %xmm7 - movaps 0x61(%rsi), %xmm8 - movaps 0x71(%rsi), %xmm9 - lea 0x80(%rsi), %rsi - palignr $15, %xmm8, %xmm9 - movaps %xmm9, 0x70(%rdi) - palignr $15, %xmm7, %xmm8 - movaps %xmm8, 0x60(%rdi) - palignr $15, %xmm6, %xmm7 - movaps %xmm7, 0x50(%rdi) - palignr $15, %xmm5, %xmm6 - movaps %xmm6, 0x40(%rdi) - palignr $15, %xmm4, %xmm5 - movaps %xmm5, 0x30(%rdi) - palignr $15, %xmm3, %xmm4 - movaps %xmm4, 0x20(%rdi) - palignr $15, %xmm2, %xmm3 - movaps %xmm3, 0x10(%rdi) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdi) - lea 0x80(%rdi), %rdi - jae L(shl_15) - movdqu %xmm0, (%r8) - add $0x80, %rdx - add %rdx, %rdi - add %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(shl_15_bwd): - movaps -0x0f(%rsi), %xmm1 - - movaps -0x1f(%rsi), %xmm2 - palignr $15, %xmm2, %xmm1 - movaps %xmm1, -0x10(%rdi) - - movaps -0x2f(%rsi), %xmm3 - palignr $15, %xmm3, %xmm2 - movaps %xmm2, -0x20(%rdi) - - movaps -0x3f(%rsi), %xmm4 - palignr $15, %xmm4, %xmm3 - movaps %xmm3, -0x30(%rdi) - - movaps -0x4f(%rsi), %xmm5 - palignr $15, %xmm5, %xmm4 - movaps %xmm4, -0x40(%rdi) - - movaps -0x5f(%rsi), %xmm6 - palignr $15, %xmm6, %xmm5 - movaps %xmm5, -0x50(%rdi) - - movaps -0x6f(%rsi), %xmm7 - palignr $15, %xmm7, %xmm6 - movaps %xmm6, -0x60(%rdi) - - movaps -0x7f(%rsi), %xmm8 - palignr $15, %xmm8, %xmm7 - movaps %xmm7, 
-0x70(%rdi) - - movaps -0x8f(%rsi), %xmm9 - palignr $15, %xmm9, %xmm8 - movaps %xmm8, -0x80(%rdi) - - sub $0x80, %rdx - lea -0x80(%rdi), %rdi - lea -0x80(%rsi), %rsi - jae L(shl_15_bwd) - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rdi - sub %rdx, %rsi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_fwd): - movdqu (%rsi), %xmm1 - movdqu %xmm0, (%r8) - movdqa %xmm1, (%rdi) - sub $16, %rdx - add $16, %rsi - add $16, %rdi - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rsi, %r9 - sub %rdi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_fwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_fwd_start) -L(memmove_is_memcpy_fwd): -#endif - cmp %rcx, %rdx - ja L(bigger_in_fwd) - mov %rdx, %rcx -L(bigger_in_fwd): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy_fwd) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy_fwd) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy_fwd): - sub $0x80, %rdx -L(gobble_mem_fwd_loop): - sub $0x80, %rdx - prefetcht0 0x200(%rsi) - prefetcht0 0x300(%rsi) - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - lfence - movntdq %xmm0, (%rdi) - movntdq %xmm1, 0x10(%rdi) - movntdq %xmm2, 0x20(%rdi) - movntdq %xmm3, 0x30(%rdi) - movntdq %xmm4, 0x40(%rdi) - movntdq %xmm5, 0x50(%rdi) - movntdq %xmm6, 0x60(%rdi) - movntdq %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_mem_fwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_fwd_end) - add $0x80, %rdx -L(ll_cache_copy_fwd): - add %rcx, %rdx -L(ll_cache_copy_fwd_start): - sub $0x80, %rdx -L(gobble_ll_loop_fwd): - prefetchnta 0x1c0(%rsi) - prefetchnta 0x280(%rsi) - prefetchnta 0x1c0(%rdi) - prefetchnta 0x280(%rdi) - sub $0x80, %rdx - movdqu (%rsi), %xmm0 - movdqu 0x10(%rsi), %xmm1 - movdqu 0x20(%rsi), %xmm2 - movdqu 0x30(%rsi), %xmm3 - movdqu 0x40(%rsi), %xmm4 - movdqu 0x50(%rsi), %xmm5 - movdqu 0x60(%rsi), %xmm6 - movdqu 0x70(%rsi), %xmm7 - movdqa %xmm0, (%rdi) - movdqa %xmm1, 0x10(%rdi) - movdqa %xmm2, 0x20(%rdi) - movdqa %xmm3, 0x30(%rdi) - movdqa %xmm4, 0x40(%rdi) - movdqa %xmm5, 0x50(%rdi) - movdqa %xmm6, 0x60(%rdi) - movdqa %xmm7, 0x70(%rdi) - lea 0x80(%rsi), %rsi - lea 0x80(%rdi), %rdi - jae L(gobble_ll_loop_fwd) -L(gobble_mem_fwd_end): - add $0x80, %rdx - add %rdx, %rsi - add %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) - - .p2align 4 -L(gobble_mem_bwd): - add %rdx, %rsi - add %rdx, %rdi - - movdqu -16(%rsi), %xmm0 - lea -16(%rdi), %r8 - mov %rdi, %r9 - and $-16, %rdi - sub %rdi, %r9 - sub %r9, %rsi - sub %r9, %rdx - - -#ifdef SHARED_CACHE_SIZE_HALF - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP -#else - mov __x86_shared_cache_size_half(%rip), %RCX_LP -#endif -#ifdef USE_AS_MEMMOVE - mov %rdi, %r9 - sub %rsi, %r9 - cmp %rdx, %r9 - jae L(memmove_is_memcpy_bwd) - cmp %rcx, %r9 - jbe L(ll_cache_copy_bwd_start) -L(memmove_is_memcpy_bwd): -#endif - cmp %rcx, %rdx - ja L(bigger) - mov %rdx, %rcx -L(bigger): - sub %rcx, %rdx - cmp $0x1000, %rdx - jbe L(ll_cache_copy) - - mov %rcx, %r9 - shl $3, %r9 - cmp %r9, %rdx - jbe L(2steps_copy) - add %rcx, %rdx - xor %rcx, %rcx -L(2steps_copy): - sub $0x80, %rdx -L(gobble_mem_bwd_loop): - sub $0x80, %rdx - prefetcht0 -0x200(%rsi) - prefetcht0 -0x300(%rsi) - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), 
%xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - lfence - movntdq %xmm1, -0x10(%rdi) - movntdq %xmm2, -0x20(%rdi) - movntdq %xmm3, -0x30(%rdi) - movntdq %xmm4, -0x40(%rdi) - movntdq %xmm5, -0x50(%rdi) - movntdq %xmm6, -0x60(%rdi) - movntdq %xmm7, -0x70(%rdi) - movntdq %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_mem_bwd_loop) - sfence - cmp $0x80, %rcx - jb L(gobble_mem_bwd_end) - add $0x80, %rdx -L(ll_cache_copy): - add %rcx, %rdx -L(ll_cache_copy_bwd_start): - sub $0x80, %rdx -L(gobble_ll_loop): - prefetchnta -0x1c0(%rsi) - prefetchnta -0x280(%rsi) - prefetchnta -0x1c0(%rdi) - prefetchnta -0x280(%rdi) - sub $0x80, %rdx - movdqu -0x10(%rsi), %xmm1 - movdqu -0x20(%rsi), %xmm2 - movdqu -0x30(%rsi), %xmm3 - movdqu -0x40(%rsi), %xmm4 - movdqu -0x50(%rsi), %xmm5 - movdqu -0x60(%rsi), %xmm6 - movdqu -0x70(%rsi), %xmm7 - movdqu -0x80(%rsi), %xmm8 - movdqa %xmm1, -0x10(%rdi) - movdqa %xmm2, -0x20(%rdi) - movdqa %xmm3, -0x30(%rdi) - movdqa %xmm4, -0x40(%rdi) - movdqa %xmm5, -0x50(%rdi) - movdqa %xmm6, -0x60(%rdi) - movdqa %xmm7, -0x70(%rdi) - movdqa %xmm8, -0x80(%rdi) - lea -0x80(%rsi), %rsi - lea -0x80(%rdi), %rdi - jae L(gobble_ll_loop) -L(gobble_mem_bwd_end): - movdqu %xmm0, (%r8) - add $0x80, %rdx - sub %rdx, %rsi - sub %rdx, %rdi - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) - - .p2align 4 -L(fwd_write_128bytes): - lddqu -128(%rsi), %xmm0 - movdqu %xmm0, -128(%rdi) -L(fwd_write_112bytes): - lddqu -112(%rsi), %xmm0 - movdqu %xmm0, -112(%rdi) -L(fwd_write_96bytes): - lddqu -96(%rsi), %xmm0 - movdqu %xmm0, -96(%rdi) -L(fwd_write_80bytes): - lddqu -80(%rsi), %xmm0 - movdqu %xmm0, -80(%rdi) -L(fwd_write_64bytes): - lddqu -64(%rsi), %xmm0 - movdqu %xmm0, -64(%rdi) -L(fwd_write_48bytes): - lddqu -48(%rsi), %xmm0 - movdqu %xmm0, -48(%rdi) -L(fwd_write_32bytes): - lddqu -32(%rsi), %xmm0 - movdqu %xmm0, -32(%rdi) -L(fwd_write_16bytes): - lddqu -16(%rsi), %xmm0 - movdqu %xmm0, -16(%rdi) -L(fwd_write_0bytes): - ret - - - .p2align 4 -L(fwd_write_143bytes): - lddqu -143(%rsi), %xmm0 - movdqu %xmm0, -143(%rdi) -L(fwd_write_127bytes): - lddqu -127(%rsi), %xmm0 - movdqu %xmm0, -127(%rdi) -L(fwd_write_111bytes): - lddqu -111(%rsi), %xmm0 - movdqu %xmm0, -111(%rdi) -L(fwd_write_95bytes): - lddqu -95(%rsi), %xmm0 - movdqu %xmm0, -95(%rdi) -L(fwd_write_79bytes): - lddqu -79(%rsi), %xmm0 - movdqu %xmm0, -79(%rdi) -L(fwd_write_63bytes): - lddqu -63(%rsi), %xmm0 - movdqu %xmm0, -63(%rdi) -L(fwd_write_47bytes): - lddqu -47(%rsi), %xmm0 - movdqu %xmm0, -47(%rdi) -L(fwd_write_31bytes): - lddqu -31(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -31(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_15bytes): - mov -15(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -15(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_142bytes): - lddqu -142(%rsi), %xmm0 - movdqu %xmm0, -142(%rdi) -L(fwd_write_126bytes): - lddqu -126(%rsi), %xmm0 - movdqu %xmm0, -126(%rdi) -L(fwd_write_110bytes): - lddqu -110(%rsi), %xmm0 - movdqu %xmm0, -110(%rdi) -L(fwd_write_94bytes): - lddqu -94(%rsi), %xmm0 - movdqu %xmm0, -94(%rdi) -L(fwd_write_78bytes): - lddqu -78(%rsi), %xmm0 - movdqu %xmm0, -78(%rdi) -L(fwd_write_62bytes): - lddqu -62(%rsi), %xmm0 - movdqu %xmm0, -62(%rdi) -L(fwd_write_46bytes): - lddqu -46(%rsi), %xmm0 - movdqu %xmm0, -46(%rdi) -L(fwd_write_30bytes): - lddqu -30(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, 
-30(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_14bytes): - mov -14(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -14(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_141bytes): - lddqu -141(%rsi), %xmm0 - movdqu %xmm0, -141(%rdi) -L(fwd_write_125bytes): - lddqu -125(%rsi), %xmm0 - movdqu %xmm0, -125(%rdi) -L(fwd_write_109bytes): - lddqu -109(%rsi), %xmm0 - movdqu %xmm0, -109(%rdi) -L(fwd_write_93bytes): - lddqu -93(%rsi), %xmm0 - movdqu %xmm0, -93(%rdi) -L(fwd_write_77bytes): - lddqu -77(%rsi), %xmm0 - movdqu %xmm0, -77(%rdi) -L(fwd_write_61bytes): - lddqu -61(%rsi), %xmm0 - movdqu %xmm0, -61(%rdi) -L(fwd_write_45bytes): - lddqu -45(%rsi), %xmm0 - movdqu %xmm0, -45(%rdi) -L(fwd_write_29bytes): - lddqu -29(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -29(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_13bytes): - mov -13(%rsi), %rdx - mov -8(%rsi), %rcx - mov %rdx, -13(%rdi) - mov %rcx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_140bytes): - lddqu -140(%rsi), %xmm0 - movdqu %xmm0, -140(%rdi) -L(fwd_write_124bytes): - lddqu -124(%rsi), %xmm0 - movdqu %xmm0, -124(%rdi) -L(fwd_write_108bytes): - lddqu -108(%rsi), %xmm0 - movdqu %xmm0, -108(%rdi) -L(fwd_write_92bytes): - lddqu -92(%rsi), %xmm0 - movdqu %xmm0, -92(%rdi) -L(fwd_write_76bytes): - lddqu -76(%rsi), %xmm0 - movdqu %xmm0, -76(%rdi) -L(fwd_write_60bytes): - lddqu -60(%rsi), %xmm0 - movdqu %xmm0, -60(%rdi) -L(fwd_write_44bytes): - lddqu -44(%rsi), %xmm0 - movdqu %xmm0, -44(%rdi) -L(fwd_write_28bytes): - lddqu -28(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -28(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_12bytes): - mov -12(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -12(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_139bytes): - lddqu -139(%rsi), %xmm0 - movdqu %xmm0, -139(%rdi) -L(fwd_write_123bytes): - lddqu -123(%rsi), %xmm0 - movdqu %xmm0, -123(%rdi) -L(fwd_write_107bytes): - lddqu -107(%rsi), %xmm0 - movdqu %xmm0, -107(%rdi) -L(fwd_write_91bytes): - lddqu -91(%rsi), %xmm0 - movdqu %xmm0, -91(%rdi) -L(fwd_write_75bytes): - lddqu -75(%rsi), %xmm0 - movdqu %xmm0, -75(%rdi) -L(fwd_write_59bytes): - lddqu -59(%rsi), %xmm0 - movdqu %xmm0, -59(%rdi) -L(fwd_write_43bytes): - lddqu -43(%rsi), %xmm0 - movdqu %xmm0, -43(%rdi) -L(fwd_write_27bytes): - lddqu -27(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -27(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_11bytes): - mov -11(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -11(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_138bytes): - lddqu -138(%rsi), %xmm0 - movdqu %xmm0, -138(%rdi) -L(fwd_write_122bytes): - lddqu -122(%rsi), %xmm0 - movdqu %xmm0, -122(%rdi) -L(fwd_write_106bytes): - lddqu -106(%rsi), %xmm0 - movdqu %xmm0, -106(%rdi) -L(fwd_write_90bytes): - lddqu -90(%rsi), %xmm0 - movdqu %xmm0, -90(%rdi) -L(fwd_write_74bytes): - lddqu -74(%rsi), %xmm0 - movdqu %xmm0, -74(%rdi) -L(fwd_write_58bytes): - lddqu -58(%rsi), %xmm0 - movdqu %xmm0, -58(%rdi) -L(fwd_write_42bytes): - lddqu -42(%rsi), %xmm0 - movdqu %xmm0, -42(%rdi) -L(fwd_write_26bytes): - lddqu -26(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -26(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_10bytes): - mov -10(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -10(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_137bytes): - lddqu -137(%rsi), %xmm0 - movdqu %xmm0, -137(%rdi) -L(fwd_write_121bytes): - lddqu -121(%rsi), %xmm0 - movdqu %xmm0, -121(%rdi) 
-L(fwd_write_105bytes): - lddqu -105(%rsi), %xmm0 - movdqu %xmm0, -105(%rdi) -L(fwd_write_89bytes): - lddqu -89(%rsi), %xmm0 - movdqu %xmm0, -89(%rdi) -L(fwd_write_73bytes): - lddqu -73(%rsi), %xmm0 - movdqu %xmm0, -73(%rdi) -L(fwd_write_57bytes): - lddqu -57(%rsi), %xmm0 - movdqu %xmm0, -57(%rdi) -L(fwd_write_41bytes): - lddqu -41(%rsi), %xmm0 - movdqu %xmm0, -41(%rdi) -L(fwd_write_25bytes): - lddqu -25(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -25(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_9bytes): - mov -9(%rsi), %rdx - mov -4(%rsi), %ecx - mov %rdx, -9(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_136bytes): - lddqu -136(%rsi), %xmm0 - movdqu %xmm0, -136(%rdi) -L(fwd_write_120bytes): - lddqu -120(%rsi), %xmm0 - movdqu %xmm0, -120(%rdi) -L(fwd_write_104bytes): - lddqu -104(%rsi), %xmm0 - movdqu %xmm0, -104(%rdi) -L(fwd_write_88bytes): - lddqu -88(%rsi), %xmm0 - movdqu %xmm0, -88(%rdi) -L(fwd_write_72bytes): - lddqu -72(%rsi), %xmm0 - movdqu %xmm0, -72(%rdi) -L(fwd_write_56bytes): - lddqu -56(%rsi), %xmm0 - movdqu %xmm0, -56(%rdi) -L(fwd_write_40bytes): - lddqu -40(%rsi), %xmm0 - movdqu %xmm0, -40(%rdi) -L(fwd_write_24bytes): - lddqu -24(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -24(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_8bytes): - mov -8(%rsi), %rdx - mov %rdx, -8(%rdi) - ret - - .p2align 4 -L(fwd_write_135bytes): - lddqu -135(%rsi), %xmm0 - movdqu %xmm0, -135(%rdi) -L(fwd_write_119bytes): - lddqu -119(%rsi), %xmm0 - movdqu %xmm0, -119(%rdi) -L(fwd_write_103bytes): - lddqu -103(%rsi), %xmm0 - movdqu %xmm0, -103(%rdi) -L(fwd_write_87bytes): - lddqu -87(%rsi), %xmm0 - movdqu %xmm0, -87(%rdi) -L(fwd_write_71bytes): - lddqu -71(%rsi), %xmm0 - movdqu %xmm0, -71(%rdi) -L(fwd_write_55bytes): - lddqu -55(%rsi), %xmm0 - movdqu %xmm0, -55(%rdi) -L(fwd_write_39bytes): - lddqu -39(%rsi), %xmm0 - movdqu %xmm0, -39(%rdi) -L(fwd_write_23bytes): - lddqu -23(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -23(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_7bytes): - mov -7(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -7(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_134bytes): - lddqu -134(%rsi), %xmm0 - movdqu %xmm0, -134(%rdi) -L(fwd_write_118bytes): - lddqu -118(%rsi), %xmm0 - movdqu %xmm0, -118(%rdi) -L(fwd_write_102bytes): - lddqu -102(%rsi), %xmm0 - movdqu %xmm0, -102(%rdi) -L(fwd_write_86bytes): - lddqu -86(%rsi), %xmm0 - movdqu %xmm0, -86(%rdi) -L(fwd_write_70bytes): - lddqu -70(%rsi), %xmm0 - movdqu %xmm0, -70(%rdi) -L(fwd_write_54bytes): - lddqu -54(%rsi), %xmm0 - movdqu %xmm0, -54(%rdi) -L(fwd_write_38bytes): - lddqu -38(%rsi), %xmm0 - movdqu %xmm0, -38(%rdi) -L(fwd_write_22bytes): - lddqu -22(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -22(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_6bytes): - mov -6(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -6(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_133bytes): - lddqu -133(%rsi), %xmm0 - movdqu %xmm0, -133(%rdi) -L(fwd_write_117bytes): - lddqu -117(%rsi), %xmm0 - movdqu %xmm0, -117(%rdi) -L(fwd_write_101bytes): - lddqu -101(%rsi), %xmm0 - movdqu %xmm0, -101(%rdi) -L(fwd_write_85bytes): - lddqu -85(%rsi), %xmm0 - movdqu %xmm0, -85(%rdi) -L(fwd_write_69bytes): - lddqu -69(%rsi), %xmm0 - movdqu %xmm0, -69(%rdi) -L(fwd_write_53bytes): - lddqu -53(%rsi), %xmm0 - movdqu %xmm0, -53(%rdi) -L(fwd_write_37bytes): - lddqu -37(%rsi), %xmm0 - movdqu %xmm0, -37(%rdi) -L(fwd_write_21bytes): 
- lddqu -21(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -21(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_5bytes): - mov -5(%rsi), %edx - mov -4(%rsi), %ecx - mov %edx, -5(%rdi) - mov %ecx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_132bytes): - lddqu -132(%rsi), %xmm0 - movdqu %xmm0, -132(%rdi) -L(fwd_write_116bytes): - lddqu -116(%rsi), %xmm0 - movdqu %xmm0, -116(%rdi) -L(fwd_write_100bytes): - lddqu -100(%rsi), %xmm0 - movdqu %xmm0, -100(%rdi) -L(fwd_write_84bytes): - lddqu -84(%rsi), %xmm0 - movdqu %xmm0, -84(%rdi) -L(fwd_write_68bytes): - lddqu -68(%rsi), %xmm0 - movdqu %xmm0, -68(%rdi) -L(fwd_write_52bytes): - lddqu -52(%rsi), %xmm0 - movdqu %xmm0, -52(%rdi) -L(fwd_write_36bytes): - lddqu -36(%rsi), %xmm0 - movdqu %xmm0, -36(%rdi) -L(fwd_write_20bytes): - lddqu -20(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -20(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_4bytes): - mov -4(%rsi), %edx - mov %edx, -4(%rdi) - ret - - .p2align 4 -L(fwd_write_131bytes): - lddqu -131(%rsi), %xmm0 - movdqu %xmm0, -131(%rdi) -L(fwd_write_115bytes): - lddqu -115(%rsi), %xmm0 - movdqu %xmm0, -115(%rdi) -L(fwd_write_99bytes): - lddqu -99(%rsi), %xmm0 - movdqu %xmm0, -99(%rdi) -L(fwd_write_83bytes): - lddqu -83(%rsi), %xmm0 - movdqu %xmm0, -83(%rdi) -L(fwd_write_67bytes): - lddqu -67(%rsi), %xmm0 - movdqu %xmm0, -67(%rdi) -L(fwd_write_51bytes): - lddqu -51(%rsi), %xmm0 - movdqu %xmm0, -51(%rdi) -L(fwd_write_35bytes): - lddqu -35(%rsi), %xmm0 - movdqu %xmm0, -35(%rdi) -L(fwd_write_19bytes): - lddqu -19(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -19(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_3bytes): - mov -3(%rsi), %dx - mov -2(%rsi), %cx - mov %dx, -3(%rdi) - mov %cx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_130bytes): - lddqu -130(%rsi), %xmm0 - movdqu %xmm0, -130(%rdi) -L(fwd_write_114bytes): - lddqu -114(%rsi), %xmm0 - movdqu %xmm0, -114(%rdi) -L(fwd_write_98bytes): - lddqu -98(%rsi), %xmm0 - movdqu %xmm0, -98(%rdi) -L(fwd_write_82bytes): - lddqu -82(%rsi), %xmm0 - movdqu %xmm0, -82(%rdi) -L(fwd_write_66bytes): - lddqu -66(%rsi), %xmm0 - movdqu %xmm0, -66(%rdi) -L(fwd_write_50bytes): - lddqu -50(%rsi), %xmm0 - movdqu %xmm0, -50(%rdi) -L(fwd_write_34bytes): - lddqu -34(%rsi), %xmm0 - movdqu %xmm0, -34(%rdi) -L(fwd_write_18bytes): - lddqu -18(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -18(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_2bytes): - movzwl -2(%rsi), %edx - mov %dx, -2(%rdi) - ret - - .p2align 4 -L(fwd_write_129bytes): - lddqu -129(%rsi), %xmm0 - movdqu %xmm0, -129(%rdi) -L(fwd_write_113bytes): - lddqu -113(%rsi), %xmm0 - movdqu %xmm0, -113(%rdi) -L(fwd_write_97bytes): - lddqu -97(%rsi), %xmm0 - movdqu %xmm0, -97(%rdi) -L(fwd_write_81bytes): - lddqu -81(%rsi), %xmm0 - movdqu %xmm0, -81(%rdi) -L(fwd_write_65bytes): - lddqu -65(%rsi), %xmm0 - movdqu %xmm0, -65(%rdi) -L(fwd_write_49bytes): - lddqu -49(%rsi), %xmm0 - movdqu %xmm0, -49(%rdi) -L(fwd_write_33bytes): - lddqu -33(%rsi), %xmm0 - movdqu %xmm0, -33(%rdi) -L(fwd_write_17bytes): - lddqu -17(%rsi), %xmm0 - lddqu -16(%rsi), %xmm1 - movdqu %xmm0, -17(%rdi) - movdqu %xmm1, -16(%rdi) - ret - - .p2align 4 -L(fwd_write_1bytes): - movzbl -1(%rsi), %edx - mov %dl, -1(%rdi) - ret - - .p2align 4 -L(bwd_write_128bytes): - lddqu 112(%rsi), %xmm0 - movdqu %xmm0, 112(%rdi) -L(bwd_write_112bytes): - lddqu 96(%rsi), %xmm0 - movdqu %xmm0, 96(%rdi) -L(bwd_write_96bytes): - lddqu 80(%rsi), %xmm0 - movdqu %xmm0, 80(%rdi) 
-L(bwd_write_80bytes): - lddqu 64(%rsi), %xmm0 - movdqu %xmm0, 64(%rdi) -L(bwd_write_64bytes): - lddqu 48(%rsi), %xmm0 - movdqu %xmm0, 48(%rdi) -L(bwd_write_48bytes): - lddqu 32(%rsi), %xmm0 - movdqu %xmm0, 32(%rdi) -L(bwd_write_32bytes): - lddqu 16(%rsi), %xmm0 - movdqu %xmm0, 16(%rdi) -L(bwd_write_16bytes): - lddqu (%rsi), %xmm0 - movdqu %xmm0, (%rdi) -L(bwd_write_0bytes): - ret - - .p2align 4 -L(bwd_write_143bytes): - lddqu 127(%rsi), %xmm0 - movdqu %xmm0, 127(%rdi) -L(bwd_write_127bytes): - lddqu 111(%rsi), %xmm0 - movdqu %xmm0, 111(%rdi) -L(bwd_write_111bytes): - lddqu 95(%rsi), %xmm0 - movdqu %xmm0, 95(%rdi) -L(bwd_write_95bytes): - lddqu 79(%rsi), %xmm0 - movdqu %xmm0, 79(%rdi) -L(bwd_write_79bytes): - lddqu 63(%rsi), %xmm0 - movdqu %xmm0, 63(%rdi) -L(bwd_write_63bytes): - lddqu 47(%rsi), %xmm0 - movdqu %xmm0, 47(%rdi) -L(bwd_write_47bytes): - lddqu 31(%rsi), %xmm0 - movdqu %xmm0, 31(%rdi) -L(bwd_write_31bytes): - lddqu 15(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 15(%rdi) - movdqu %xmm1, (%rdi) - ret - - - .p2align 4 -L(bwd_write_15bytes): - mov 7(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 7(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_142bytes): - lddqu 126(%rsi), %xmm0 - movdqu %xmm0, 126(%rdi) -L(bwd_write_126bytes): - lddqu 110(%rsi), %xmm0 - movdqu %xmm0, 110(%rdi) -L(bwd_write_110bytes): - lddqu 94(%rsi), %xmm0 - movdqu %xmm0, 94(%rdi) -L(bwd_write_94bytes): - lddqu 78(%rsi), %xmm0 - movdqu %xmm0, 78(%rdi) -L(bwd_write_78bytes): - lddqu 62(%rsi), %xmm0 - movdqu %xmm0, 62(%rdi) -L(bwd_write_62bytes): - lddqu 46(%rsi), %xmm0 - movdqu %xmm0, 46(%rdi) -L(bwd_write_46bytes): - lddqu 30(%rsi), %xmm0 - movdqu %xmm0, 30(%rdi) -L(bwd_write_30bytes): - lddqu 14(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 14(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_14bytes): - mov 6(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 6(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_141bytes): - lddqu 125(%rsi), %xmm0 - movdqu %xmm0, 125(%rdi) -L(bwd_write_125bytes): - lddqu 109(%rsi), %xmm0 - movdqu %xmm0, 109(%rdi) -L(bwd_write_109bytes): - lddqu 93(%rsi), %xmm0 - movdqu %xmm0, 93(%rdi) -L(bwd_write_93bytes): - lddqu 77(%rsi), %xmm0 - movdqu %xmm0, 77(%rdi) -L(bwd_write_77bytes): - lddqu 61(%rsi), %xmm0 - movdqu %xmm0, 61(%rdi) -L(bwd_write_61bytes): - lddqu 45(%rsi), %xmm0 - movdqu %xmm0, 45(%rdi) -L(bwd_write_45bytes): - lddqu 29(%rsi), %xmm0 - movdqu %xmm0, 29(%rdi) -L(bwd_write_29bytes): - lddqu 13(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 13(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_13bytes): - mov 5(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 5(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_140bytes): - lddqu 124(%rsi), %xmm0 - movdqu %xmm0, 124(%rdi) -L(bwd_write_124bytes): - lddqu 108(%rsi), %xmm0 - movdqu %xmm0, 108(%rdi) -L(bwd_write_108bytes): - lddqu 92(%rsi), %xmm0 - movdqu %xmm0, 92(%rdi) -L(bwd_write_92bytes): - lddqu 76(%rsi), %xmm0 - movdqu %xmm0, 76(%rdi) -L(bwd_write_76bytes): - lddqu 60(%rsi), %xmm0 - movdqu %xmm0, 60(%rdi) -L(bwd_write_60bytes): - lddqu 44(%rsi), %xmm0 - movdqu %xmm0, 44(%rdi) -L(bwd_write_44bytes): - lddqu 28(%rsi), %xmm0 - movdqu %xmm0, 28(%rdi) -L(bwd_write_28bytes): - lddqu 12(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 12(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_12bytes): - mov 4(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 4(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_139bytes): - lddqu 123(%rsi), %xmm0 - movdqu %xmm0, 
123(%rdi) -L(bwd_write_123bytes): - lddqu 107(%rsi), %xmm0 - movdqu %xmm0, 107(%rdi) -L(bwd_write_107bytes): - lddqu 91(%rsi), %xmm0 - movdqu %xmm0, 91(%rdi) -L(bwd_write_91bytes): - lddqu 75(%rsi), %xmm0 - movdqu %xmm0, 75(%rdi) -L(bwd_write_75bytes): - lddqu 59(%rsi), %xmm0 - movdqu %xmm0, 59(%rdi) -L(bwd_write_59bytes): - lddqu 43(%rsi), %xmm0 - movdqu %xmm0, 43(%rdi) -L(bwd_write_43bytes): - lddqu 27(%rsi), %xmm0 - movdqu %xmm0, 27(%rdi) -L(bwd_write_27bytes): - lddqu 11(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 11(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_11bytes): - mov 3(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 3(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_138bytes): - lddqu 122(%rsi), %xmm0 - movdqu %xmm0, 122(%rdi) -L(bwd_write_122bytes): - lddqu 106(%rsi), %xmm0 - movdqu %xmm0, 106(%rdi) -L(bwd_write_106bytes): - lddqu 90(%rsi), %xmm0 - movdqu %xmm0, 90(%rdi) -L(bwd_write_90bytes): - lddqu 74(%rsi), %xmm0 - movdqu %xmm0, 74(%rdi) -L(bwd_write_74bytes): - lddqu 58(%rsi), %xmm0 - movdqu %xmm0, 58(%rdi) -L(bwd_write_58bytes): - lddqu 42(%rsi), %xmm0 - movdqu %xmm0, 42(%rdi) -L(bwd_write_42bytes): - lddqu 26(%rsi), %xmm0 - movdqu %xmm0, 26(%rdi) -L(bwd_write_26bytes): - lddqu 10(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 10(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_10bytes): - mov 2(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 2(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_137bytes): - lddqu 121(%rsi), %xmm0 - movdqu %xmm0, 121(%rdi) -L(bwd_write_121bytes): - lddqu 105(%rsi), %xmm0 - movdqu %xmm0, 105(%rdi) -L(bwd_write_105bytes): - lddqu 89(%rsi), %xmm0 - movdqu %xmm0, 89(%rdi) -L(bwd_write_89bytes): - lddqu 73(%rsi), %xmm0 - movdqu %xmm0, 73(%rdi) -L(bwd_write_73bytes): - lddqu 57(%rsi), %xmm0 - movdqu %xmm0, 57(%rdi) -L(bwd_write_57bytes): - lddqu 41(%rsi), %xmm0 - movdqu %xmm0, 41(%rdi) -L(bwd_write_41bytes): - lddqu 25(%rsi), %xmm0 - movdqu %xmm0, 25(%rdi) -L(bwd_write_25bytes): - lddqu 9(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 9(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_9bytes): - mov 1(%rsi), %rdx - mov (%rsi), %rcx - mov %rdx, 1(%rdi) - mov %rcx, (%rdi) - ret - - .p2align 4 -L(bwd_write_136bytes): - lddqu 120(%rsi), %xmm0 - movdqu %xmm0, 120(%rdi) -L(bwd_write_120bytes): - lddqu 104(%rsi), %xmm0 - movdqu %xmm0, 104(%rdi) -L(bwd_write_104bytes): - lddqu 88(%rsi), %xmm0 - movdqu %xmm0, 88(%rdi) -L(bwd_write_88bytes): - lddqu 72(%rsi), %xmm0 - movdqu %xmm0, 72(%rdi) -L(bwd_write_72bytes): - lddqu 56(%rsi), %xmm0 - movdqu %xmm0, 56(%rdi) -L(bwd_write_56bytes): - lddqu 40(%rsi), %xmm0 - movdqu %xmm0, 40(%rdi) -L(bwd_write_40bytes): - lddqu 24(%rsi), %xmm0 - movdqu %xmm0, 24(%rdi) -L(bwd_write_24bytes): - lddqu 8(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 8(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_8bytes): - mov (%rsi), %rdx - mov %rdx, (%rdi) - ret - - .p2align 4 -L(bwd_write_135bytes): - lddqu 119(%rsi), %xmm0 - movdqu %xmm0, 119(%rdi) -L(bwd_write_119bytes): - lddqu 103(%rsi), %xmm0 - movdqu %xmm0, 103(%rdi) -L(bwd_write_103bytes): - lddqu 87(%rsi), %xmm0 - movdqu %xmm0, 87(%rdi) -L(bwd_write_87bytes): - lddqu 71(%rsi), %xmm0 - movdqu %xmm0, 71(%rdi) -L(bwd_write_71bytes): - lddqu 55(%rsi), %xmm0 - movdqu %xmm0, 55(%rdi) -L(bwd_write_55bytes): - lddqu 39(%rsi), %xmm0 - movdqu %xmm0, 39(%rdi) -L(bwd_write_39bytes): - lddqu 23(%rsi), %xmm0 - movdqu %xmm0, 23(%rdi) -L(bwd_write_23bytes): - lddqu 7(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - 
movdqu %xmm0, 7(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_7bytes): - mov 3(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 3(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_134bytes): - lddqu 118(%rsi), %xmm0 - movdqu %xmm0, 118(%rdi) -L(bwd_write_118bytes): - lddqu 102(%rsi), %xmm0 - movdqu %xmm0, 102(%rdi) -L(bwd_write_102bytes): - lddqu 86(%rsi), %xmm0 - movdqu %xmm0, 86(%rdi) -L(bwd_write_86bytes): - lddqu 70(%rsi), %xmm0 - movdqu %xmm0, 70(%rdi) -L(bwd_write_70bytes): - lddqu 54(%rsi), %xmm0 - movdqu %xmm0, 54(%rdi) -L(bwd_write_54bytes): - lddqu 38(%rsi), %xmm0 - movdqu %xmm0, 38(%rdi) -L(bwd_write_38bytes): - lddqu 22(%rsi), %xmm0 - movdqu %xmm0, 22(%rdi) -L(bwd_write_22bytes): - lddqu 6(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 6(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_6bytes): - mov 2(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 2(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_133bytes): - lddqu 117(%rsi), %xmm0 - movdqu %xmm0, 117(%rdi) -L(bwd_write_117bytes): - lddqu 101(%rsi), %xmm0 - movdqu %xmm0, 101(%rdi) -L(bwd_write_101bytes): - lddqu 85(%rsi), %xmm0 - movdqu %xmm0, 85(%rdi) -L(bwd_write_85bytes): - lddqu 69(%rsi), %xmm0 - movdqu %xmm0, 69(%rdi) -L(bwd_write_69bytes): - lddqu 53(%rsi), %xmm0 - movdqu %xmm0, 53(%rdi) -L(bwd_write_53bytes): - lddqu 37(%rsi), %xmm0 - movdqu %xmm0, 37(%rdi) -L(bwd_write_37bytes): - lddqu 21(%rsi), %xmm0 - movdqu %xmm0, 21(%rdi) -L(bwd_write_21bytes): - lddqu 5(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 5(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_5bytes): - mov 1(%rsi), %edx - mov (%rsi), %ecx - mov %edx, 1(%rdi) - mov %ecx, (%rdi) - ret - - .p2align 4 -L(bwd_write_132bytes): - lddqu 116(%rsi), %xmm0 - movdqu %xmm0, 116(%rdi) -L(bwd_write_116bytes): - lddqu 100(%rsi), %xmm0 - movdqu %xmm0, 100(%rdi) -L(bwd_write_100bytes): - lddqu 84(%rsi), %xmm0 - movdqu %xmm0, 84(%rdi) -L(bwd_write_84bytes): - lddqu 68(%rsi), %xmm0 - movdqu %xmm0, 68(%rdi) -L(bwd_write_68bytes): - lddqu 52(%rsi), %xmm0 - movdqu %xmm0, 52(%rdi) -L(bwd_write_52bytes): - lddqu 36(%rsi), %xmm0 - movdqu %xmm0, 36(%rdi) -L(bwd_write_36bytes): - lddqu 20(%rsi), %xmm0 - movdqu %xmm0, 20(%rdi) -L(bwd_write_20bytes): - lddqu 4(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 4(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_4bytes): - mov (%rsi), %edx - mov %edx, (%rdi) - ret - - .p2align 4 -L(bwd_write_131bytes): - lddqu 115(%rsi), %xmm0 - movdqu %xmm0, 115(%rdi) -L(bwd_write_115bytes): - lddqu 99(%rsi), %xmm0 - movdqu %xmm0, 99(%rdi) -L(bwd_write_99bytes): - lddqu 83(%rsi), %xmm0 - movdqu %xmm0, 83(%rdi) -L(bwd_write_83bytes): - lddqu 67(%rsi), %xmm0 - movdqu %xmm0, 67(%rdi) -L(bwd_write_67bytes): - lddqu 51(%rsi), %xmm0 - movdqu %xmm0, 51(%rdi) -L(bwd_write_51bytes): - lddqu 35(%rsi), %xmm0 - movdqu %xmm0, 35(%rdi) -L(bwd_write_35bytes): - lddqu 19(%rsi), %xmm0 - movdqu %xmm0, 19(%rdi) -L(bwd_write_19bytes): - lddqu 3(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 3(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_3bytes): - mov 1(%rsi), %dx - mov (%rsi), %cx - mov %dx, 1(%rdi) - mov %cx, (%rdi) - ret - - .p2align 4 -L(bwd_write_130bytes): - lddqu 114(%rsi), %xmm0 - movdqu %xmm0, 114(%rdi) -L(bwd_write_114bytes): - lddqu 98(%rsi), %xmm0 - movdqu %xmm0, 98(%rdi) -L(bwd_write_98bytes): - lddqu 82(%rsi), %xmm0 - movdqu %xmm0, 82(%rdi) -L(bwd_write_82bytes): - lddqu 66(%rsi), %xmm0 - movdqu %xmm0, 66(%rdi) -L(bwd_write_66bytes): - lddqu 50(%rsi), %xmm0 - 
movdqu %xmm0, 50(%rdi) -L(bwd_write_50bytes): - lddqu 34(%rsi), %xmm0 - movdqu %xmm0, 34(%rdi) -L(bwd_write_34bytes): - lddqu 18(%rsi), %xmm0 - movdqu %xmm0, 18(%rdi) -L(bwd_write_18bytes): - lddqu 2(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 2(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_2bytes): - movzwl (%rsi), %edx - mov %dx, (%rdi) - ret - - .p2align 4 -L(bwd_write_129bytes): - lddqu 113(%rsi), %xmm0 - movdqu %xmm0, 113(%rdi) -L(bwd_write_113bytes): - lddqu 97(%rsi), %xmm0 - movdqu %xmm0, 97(%rdi) -L(bwd_write_97bytes): - lddqu 81(%rsi), %xmm0 - movdqu %xmm0, 81(%rdi) -L(bwd_write_81bytes): - lddqu 65(%rsi), %xmm0 - movdqu %xmm0, 65(%rdi) -L(bwd_write_65bytes): - lddqu 49(%rsi), %xmm0 - movdqu %xmm0, 49(%rdi) -L(bwd_write_49bytes): - lddqu 33(%rsi), %xmm0 - movdqu %xmm0, 33(%rdi) -L(bwd_write_33bytes): - lddqu 17(%rsi), %xmm0 - movdqu %xmm0, 17(%rdi) -L(bwd_write_17bytes): - lddqu 1(%rsi), %xmm0 - lddqu (%rsi), %xmm1 - movdqu %xmm0, 1(%rdi) - movdqu %xmm1, (%rdi) - ret - - .p2align 4 -L(bwd_write_1bytes): - movzbl (%rsi), %edx - mov %dl, (%rdi) - ret - -END (MEMCPY) - - .section .rodata.ssse3,"a",@progbits - .p2align 3 -L(table_144_bytes_bwd): - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_38bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_96bytes), 
L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) - - .p2align 3 -L(table_144_bytes_fwd): - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_9bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_67bytes), 
L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) - .int JMPTBL 
(L(fwd_write_125bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) - - .p2align 3 -L(shl_table_fwd): - .int JMPTBL (L(shl_0), L(shl_table_fwd)) - .int JMPTBL (L(shl_1), L(shl_table_fwd)) - .int JMPTBL (L(shl_2), L(shl_table_fwd)) - .int JMPTBL (L(shl_3), L(shl_table_fwd)) - .int JMPTBL (L(shl_4), L(shl_table_fwd)) - .int JMPTBL (L(shl_5), L(shl_table_fwd)) - .int JMPTBL (L(shl_6), L(shl_table_fwd)) - .int JMPTBL (L(shl_7), L(shl_table_fwd)) - .int JMPTBL (L(shl_8), L(shl_table_fwd)) - .int JMPTBL (L(shl_9), L(shl_table_fwd)) - .int JMPTBL (L(shl_10), L(shl_table_fwd)) - .int JMPTBL (L(shl_11), L(shl_table_fwd)) - .int JMPTBL (L(shl_12), L(shl_table_fwd)) - .int JMPTBL (L(shl_13), L(shl_table_fwd)) - .int JMPTBL (L(shl_14), L(shl_table_fwd)) - .int JMPTBL (L(shl_15), L(shl_table_fwd)) - - .p2align 3 -L(shl_table_bwd): - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) - -#endif diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S deleted file mode 100644 index f9a4e9aff9..0000000000 --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_MEMMOVE -#define MEMCPY __memmove_ssse3_back -#define MEMCPY_CHK __memmove_chk_ssse3_back -#include "memcpy-ssse3-back.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
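The jump tables that close the deleted file above show the dispatch scheme used throughout it: each `.int JMPTBL (target, table)' entry assembles to the 32-bit difference `target - table', so a table with well over a hundred entries stays position-independent and costs only 4 bytes per slot, and the BRANCH_TO_JMPTBL_ENTRY macro (quoted in the review below) sign-extends the selected offset and adds the table address back before the indirect jump. As a rough illustration of the same relative-offset dispatch in C, using the GCC labels-as-values extension (the dispatch function and case_N label names are made up for the example; this is not glibc code):

#include <stdio.h>

static void
dispatch (int n)
{
  /* Offsets of each target relative to a common base label, analogous to
     `.int JMPTBL (L(fwd_write_Nbytes), L(table_144_bytes_fwd))'.  */
  static const int table[] = {
    &&case_0 - &&case_0,
    &&case_1 - &&case_0,
    &&case_2 - &&case_0
  };

  /* Analogous to BRANCH_TO_JMPTBL_ENTRY: load the offset for entry N,
     add the base back, and jump indirectly.  */
  goto *(&&case_0 + table[n]);

 case_0:
  puts ("tail copy: 0 bytes");
  return;
 case_1:
  puts ("tail copy: 1 byte");
  return;
 case_2:
  puts ("tail copy: 2 bytes");
  return;
}

int
main (void)
{
  dispatch (2);
  return 0;
}

The assembly version performs the same arithmetic with a movslq load from the table followed by an add of the table base, which is why each entry above can be a 4-byte .int rather than an 8-byte absolute pointer that would need a load-time relocation.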
* Re: [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back 2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-03-25 19:56 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:56 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 - > sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 - > sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 ----------------- > sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 - > 5 files changed, 3209 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 48f81711ae..323be3b969 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -16,14 +16,12 @@ sysdep_routines += \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > memcmpeq-sse2 \ > - memcpy-ssse3-back \ > memmove-avx-unaligned-erms \ > memmove-avx-unaligned-erms-rtm \ > memmove-avx512-no-vzeroupper \ > memmove-avx512-unaligned-erms \ > memmove-evex-unaligned-erms \ > memmove-sse2-unaligned-erms \ > - memmove-ssse3-back \ > memrchr-avx2 \ > memrchr-avx2-rtm \ > memrchr-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 70b0e9c62e..d6852ab365 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memmove_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memmove_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memmove_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, > __memmove_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __memmove_chk, 1, > @@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memmove, > CPU_FEATURE_USABLE (AVX512VL), > __memmove_avx512_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), > - __memmove_ssse3_back) > IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms) > IFUNC_IMPL_ADD (array, i, memmove, 1, > __memmove_sse2_unaligned) > @@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __memcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __memcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __memcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, > __memcpy_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1, > @@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, memcpy, > CPU_FEATURE_USABLE (AVX512VL), > __memcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), > - __memcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, memcpy, > CPU_FEATURE_USABLE (AVX512F), > __memcpy_avx512_no_vzeroupper) > @@ -960,9 +950,6 @@ __libc_ifunc_impl_list
(const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_chk_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, __mempcpy_chk, > - CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_chk_ssse3_back) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, > __mempcpy_chk_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1, > @@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > IFUNC_IMPL_ADD (array, i, mempcpy, > CPU_FEATURE_USABLE (AVX512VL), > __mempcpy_evex_unaligned_erms) > - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), > - __mempcpy_ssse3_back) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > __mempcpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, mempcpy, 1, > diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h > index 1ecdd4b0d3..5596ddea2c 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h > @@ -92,13 +92,6 @@ IFUNC_SELECTOR (void) > } > } > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3) > - && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy) > - { > - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward)) > - return OPTIMIZE (ssse3_back); > - } > - > if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) > return OPTIMIZE (sse2_unaligned_erms); > > diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > deleted file mode 100644 > index 92cfbf7933..0000000000 > --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S > +++ /dev/null > @@ -1,3181 +0,0 @@ > -/* memcpy with SSSE3 and REP string > - Copyright (C) 2010-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#include <sysdep.h> > - > -#if IS_IN (libc) > - > -#include "asm-syntax.h" > - > -#ifndef MEMCPY > -# define MEMCPY __memcpy_ssse3_back > -# define MEMCPY_CHK __memcpy_chk_ssse3_back > -# define MEMPCPY __mempcpy_ssse3_back > -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back > -#endif > - > -#define JMPTBL(I, B) I - B > - > -/* Branch to an entry in a jump table. TABLE is a jump table with > - relative offsets. INDEX is a register containing the index into the > - jump table. SCALE is the scale of INDEX.
*/ > -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ > - lea TABLE(%rip), %r11; \ > - movslq (%r11, INDEX, SCALE), INDEX; \ > - lea (%r11, INDEX), INDEX; \ > - _CET_NOTRACK jmp *INDEX; \ > - ud2 > - > - .section .text.ssse3,"ax",@progbits > -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE > -ENTRY (MEMPCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMPCPY_CHK) > - > -ENTRY (MEMPCPY) > - mov %RDI_LP, %RAX_LP > - add %RDX_LP, %RAX_LP > - jmp L(start) > -END (MEMPCPY) > -#endif > - > -#if !defined USE_AS_BCOPY > -ENTRY (MEMCPY_CHK) > - cmp %RDX_LP, %RCX_LP > - jb HIDDEN_JUMPTARGET (__chk_fail) > -END (MEMCPY_CHK) > -#endif > - > -ENTRY (MEMCPY) > - mov %RDI_LP, %RAX_LP > -#ifdef USE_AS_MEMPCPY > - add %RDX_LP, %RAX_LP > -#endif > - > -#ifdef __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -#endif > - > -#ifdef USE_AS_MEMMOVE > - cmp %rsi, %rdi > - jb L(copy_forward) > - je L(bwd_write_0bytes) > - cmp $144, %rdx > - jae L(copy_backward) > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -L(copy_forward): > -#endif > -L(start): > - cmp $144, %rdx > - jae L(144bytesormore) > - > -L(fwd_write_less32bytes): > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jbe L(bk_write) > -#endif > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > -#ifndef USE_AS_MEMMOVE > -L(bk_write): > - > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > -#endif > - > - .p2align 4 > -L(144bytesormore): > - > -#ifndef USE_AS_MEMMOVE > - cmp %dil, %sil > - jle L(copy_backward) > -#endif > - movdqu (%rsi), %xmm0 > - mov %rdi, %r8 > - and $-16, %rdi > - add $16, %rdi > - mov %rdi, %r9 > - sub %r8, %r9 > - sub %r9, %rdx > - add %r9, %rsi > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0) > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - cmp %rcx, %rdx > - jae L(gobble_mem_fwd) > - lea L(shl_table_fwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(copy_backward): > -#ifdef DATA_CACHE_SIZE > - mov $DATA_CACHE_SIZE, %RCX_LP > -#else > - mov __x86_data_cache_size(%rip), %RCX_LP > -#endif > - shl $1, %rcx > - cmp %rcx, %rdx > - ja L(gobble_mem_bwd) > - > - add %rdx, %rdi > - add %rdx, %rsi > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $0xf, %r9 > - xor %r9, %rdi > - sub %r9, %rsi > - sub %r9, %rdx > - mov %rsi, %r9 > - and $0xf, %r9 > - jz L(shl_0_bwd) > - lea L(shl_table_bwd)(%rip), %r11 > - sub $0x80, %rdx > - movslq (%r11, %r9, 4), %r9 > - add %r11, %r9 > - _CET_NOTRACK jmp *%r9 > - ud2 > - > - .p2align 4 > -L(shl_0): > - > - mov %rdx, %r9 > - shr $8, %r9 > - add %rdx, %r9 > -#ifdef DATA_CACHE_SIZE > - cmp $DATA_CACHE_SIZE_HALF, %R9_LP > -#else > - cmp __x86_data_cache_size_half(%rip), %R9_LP > -#endif > - jae L(gobble_mem_fwd) > - sub $0x80, %rdx > - .p2align 4 > -L(shl_0_loop): > - movdqa (%rsi), %xmm1 > - movdqa %xmm1, (%rdi) > - movaps 0x10(%rsi), %xmm2 > - movaps %xmm2, 0x10(%rdi) > - movaps 0x20(%rsi), %xmm3 > - movaps %xmm3, 0x20(%rdi) > - movaps 0x30(%rsi), %xmm4 > - movaps %xmm4, 0x30(%rdi) > - movaps 0x40(%rsi), %xmm1 > - movaps %xmm1, 0x40(%rdi) > - movaps 0x50(%rsi), %xmm2 > - movaps %xmm2, 0x50(%rdi) > - movaps 0x60(%rsi), %xmm3 > - movaps %xmm3, 0x60(%rdi) > - movaps 0x70(%rsi), %xmm4 > - movaps %xmm4, 0x70(%rdi) > - sub $0x80, %rdx > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > 
- jae L(shl_0_loop) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_0_bwd): > - sub $0x80, %rdx > -L(copy_backward_loop): > - movaps -0x10(%rsi), %xmm1 > - movaps %xmm1, -0x10(%rdi) > - movaps -0x20(%rsi), %xmm2 > - movaps %xmm2, -0x20(%rdi) > - movaps -0x30(%rsi), %xmm3 > - movaps %xmm3, -0x30(%rdi) > - movaps -0x40(%rsi), %xmm4 > - movaps %xmm4, -0x40(%rdi) > - movaps -0x50(%rsi), %xmm5 > - movaps %xmm5, -0x50(%rdi) > - movaps -0x60(%rsi), %xmm5 > - movaps %xmm5, -0x60(%rdi) > - movaps -0x70(%rsi), %xmm5 > - movaps %xmm5, -0x70(%rdi) > - movaps -0x80(%rsi), %xmm5 > - movaps %xmm5, -0x80(%rdi) > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(copy_backward_loop) > - > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1): > - sub $0x80, %rdx > - movaps -0x01(%rsi), %xmm1 > - movaps 0x0f(%rsi), %xmm2 > - movaps 0x1f(%rsi), %xmm3 > - movaps 0x2f(%rsi), %xmm4 > - movaps 0x3f(%rsi), %xmm5 > - movaps 0x4f(%rsi), %xmm6 > - movaps 0x5f(%rsi), %xmm7 > - movaps 0x6f(%rsi), %xmm8 > - movaps 0x7f(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $1, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $1, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $1, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $1, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $1, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $1, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $1, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_1) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_1_bwd): > - movaps -0x01(%rsi), %xmm1 > - > - movaps -0x11(%rsi), %xmm2 > - palignr $1, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x21(%rsi), %xmm3 > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x31(%rsi), %xmm4 > - palignr $1, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x41(%rsi), %xmm5 > - palignr $1, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x51(%rsi), %xmm6 > - palignr $1, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x61(%rsi), %xmm7 > - palignr $1, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x71(%rsi), %xmm8 > - palignr $1, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x81(%rsi), %xmm9 > - palignr $1, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_1_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_2): > - sub $0x80, %rdx > - movaps -0x02(%rsi), %xmm1 > - movaps 0x0e(%rsi), %xmm2 > - movaps 0x1e(%rsi), %xmm3 > - movaps 0x2e(%rsi), %xmm4 > - movaps 0x3e(%rsi), %xmm5 > - movaps 0x4e(%rsi), %xmm6 > - movaps 0x5e(%rsi), %xmm7 > - movaps 0x6e(%rsi), %xmm8 > - movaps 0x7e(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $2, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $2, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $2, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $2, %xmm5, %xmm6 > - 
movaps %xmm6, 0x40(%rdi) > - palignr $2, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $2, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $2, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_2) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_2_bwd): > - movaps -0x02(%rsi), %xmm1 > - > - movaps -0x12(%rsi), %xmm2 > - palignr $2, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x22(%rsi), %xmm3 > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x32(%rsi), %xmm4 > - palignr $2, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x42(%rsi), %xmm5 > - palignr $2, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x52(%rsi), %xmm6 > - palignr $2, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x62(%rsi), %xmm7 > - palignr $2, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x72(%rsi), %xmm8 > - palignr $2, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x82(%rsi), %xmm9 > - palignr $2, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_2_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3): > - sub $0x80, %rdx > - movaps -0x03(%rsi), %xmm1 > - movaps 0x0d(%rsi), %xmm2 > - movaps 0x1d(%rsi), %xmm3 > - movaps 0x2d(%rsi), %xmm4 > - movaps 0x3d(%rsi), %xmm5 > - movaps 0x4d(%rsi), %xmm6 > - movaps 0x5d(%rsi), %xmm7 > - movaps 0x6d(%rsi), %xmm8 > - movaps 0x7d(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $3, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $3, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $3, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $3, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $3, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $3, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $3, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_3) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_3_bwd): > - movaps -0x03(%rsi), %xmm1 > - > - movaps -0x13(%rsi), %xmm2 > - palignr $3, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x23(%rsi), %xmm3 > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x33(%rsi), %xmm4 > - palignr $3, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x43(%rsi), %xmm5 > - palignr $3, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x53(%rsi), %xmm6 > - palignr $3, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x63(%rsi), %xmm7 > - palignr $3, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x73(%rsi), %xmm8 > - palignr $3, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x83(%rsi), %xmm9 > - palignr $3, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_3_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4): 
> - sub $0x80, %rdx > - movaps -0x04(%rsi), %xmm1 > - movaps 0x0c(%rsi), %xmm2 > - movaps 0x1c(%rsi), %xmm3 > - movaps 0x2c(%rsi), %xmm4 > - movaps 0x3c(%rsi), %xmm5 > - movaps 0x4c(%rsi), %xmm6 > - movaps 0x5c(%rsi), %xmm7 > - movaps 0x6c(%rsi), %xmm8 > - movaps 0x7c(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $4, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $4, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $4, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $4, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $4, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $4, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $4, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_4) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_4_bwd): > - movaps -0x04(%rsi), %xmm1 > - > - movaps -0x14(%rsi), %xmm2 > - palignr $4, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x24(%rsi), %xmm3 > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x34(%rsi), %xmm4 > - palignr $4, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x44(%rsi), %xmm5 > - palignr $4, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x54(%rsi), %xmm6 > - palignr $4, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x64(%rsi), %xmm7 > - palignr $4, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x74(%rsi), %xmm8 > - palignr $4, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x84(%rsi), %xmm9 > - palignr $4, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_4_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_5): > - sub $0x80, %rdx > - movaps -0x05(%rsi), %xmm1 > - movaps 0x0b(%rsi), %xmm2 > - movaps 0x1b(%rsi), %xmm3 > - movaps 0x2b(%rsi), %xmm4 > - movaps 0x3b(%rsi), %xmm5 > - movaps 0x4b(%rsi), %xmm6 > - movaps 0x5b(%rsi), %xmm7 > - movaps 0x6b(%rsi), %xmm8 > - movaps 0x7b(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $5, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $5, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $5, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $5, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $5, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $5, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $5, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_5) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_5_bwd): > - movaps -0x05(%rsi), %xmm1 > - > - movaps -0x15(%rsi), %xmm2 > - palignr $5, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x25(%rsi), %xmm3 > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x35(%rsi), %xmm4 > - palignr $5, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x45(%rsi), %xmm5 > - palignr $5, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x55(%rsi), %xmm6 > - palignr $5, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x65(%rsi), %xmm7 > - 
palignr $5, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x75(%rsi), %xmm8 > - palignr $5, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x85(%rsi), %xmm9 > - palignr $5, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_5_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6): > - sub $0x80, %rdx > - movaps -0x06(%rsi), %xmm1 > - movaps 0x0a(%rsi), %xmm2 > - movaps 0x1a(%rsi), %xmm3 > - movaps 0x2a(%rsi), %xmm4 > - movaps 0x3a(%rsi), %xmm5 > - movaps 0x4a(%rsi), %xmm6 > - movaps 0x5a(%rsi), %xmm7 > - movaps 0x6a(%rsi), %xmm8 > - movaps 0x7a(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $6, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $6, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $6, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $6, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $6, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $6, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $6, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_6) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_6_bwd): > - movaps -0x06(%rsi), %xmm1 > - > - movaps -0x16(%rsi), %xmm2 > - palignr $6, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x26(%rsi), %xmm3 > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x36(%rsi), %xmm4 > - palignr $6, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x46(%rsi), %xmm5 > - palignr $6, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x56(%rsi), %xmm6 > - palignr $6, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x66(%rsi), %xmm7 > - palignr $6, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x76(%rsi), %xmm8 > - palignr $6, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x86(%rsi), %xmm9 > - palignr $6, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_6_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7): > - sub $0x80, %rdx > - movaps -0x07(%rsi), %xmm1 > - movaps 0x09(%rsi), %xmm2 > - movaps 0x19(%rsi), %xmm3 > - movaps 0x29(%rsi), %xmm4 > - movaps 0x39(%rsi), %xmm5 > - movaps 0x49(%rsi), %xmm6 > - movaps 0x59(%rsi), %xmm7 > - movaps 0x69(%rsi), %xmm8 > - movaps 0x79(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $7, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $7, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $7, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $7, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $7, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $7, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $7, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_7) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_7_bwd): > - movaps 
-0x07(%rsi), %xmm1 > - > - movaps -0x17(%rsi), %xmm2 > - palignr $7, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x27(%rsi), %xmm3 > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x37(%rsi), %xmm4 > - palignr $7, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x47(%rsi), %xmm5 > - palignr $7, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x57(%rsi), %xmm6 > - palignr $7, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x67(%rsi), %xmm7 > - palignr $7, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x77(%rsi), %xmm8 > - palignr $7, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x87(%rsi), %xmm9 > - palignr $7, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_7_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8): > - sub $0x80, %rdx > - movaps -0x08(%rsi), %xmm1 > - movaps 0x08(%rsi), %xmm2 > - movaps 0x18(%rsi), %xmm3 > - movaps 0x28(%rsi), %xmm4 > - movaps 0x38(%rsi), %xmm5 > - movaps 0x48(%rsi), %xmm6 > - movaps 0x58(%rsi), %xmm7 > - movaps 0x68(%rsi), %xmm8 > - movaps 0x78(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $8, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $8, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $8, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $8, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $8, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $8, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $8, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_8) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_8_bwd): > - movaps -0x08(%rsi), %xmm1 > - > - movaps -0x18(%rsi), %xmm2 > - palignr $8, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x28(%rsi), %xmm3 > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x38(%rsi), %xmm4 > - palignr $8, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x48(%rsi), %xmm5 > - palignr $8, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x58(%rsi), %xmm6 > - palignr $8, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x68(%rsi), %xmm7 > - palignr $8, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x78(%rsi), %xmm8 > - palignr $8, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x88(%rsi), %xmm9 > - palignr $8, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_8_bwd) > -L(shl_8_end_bwd): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9): > - sub $0x80, %rdx > - movaps -0x09(%rsi), %xmm1 > - movaps 0x07(%rsi), %xmm2 > - movaps 0x17(%rsi), %xmm3 > - movaps 0x27(%rsi), %xmm4 > - movaps 0x37(%rsi), %xmm5 > - movaps 0x47(%rsi), %xmm6 > - movaps 0x57(%rsi), %xmm7 > - movaps 0x67(%rsi), %xmm8 > - movaps 0x77(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $9, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $9, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $9, %xmm6, %xmm7 > - movaps 
%xmm7, 0x50(%rdi) > - palignr $9, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $9, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $9, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $9, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_9) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_9_bwd): > - movaps -0x09(%rsi), %xmm1 > - > - movaps -0x19(%rsi), %xmm2 > - palignr $9, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x29(%rsi), %xmm3 > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x39(%rsi), %xmm4 > - palignr $9, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x49(%rsi), %xmm5 > - palignr $9, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x59(%rsi), %xmm6 > - palignr $9, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x69(%rsi), %xmm7 > - palignr $9, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x79(%rsi), %xmm8 > - palignr $9, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x89(%rsi), %xmm9 > - palignr $9, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_9_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10): > - sub $0x80, %rdx > - movaps -0x0a(%rsi), %xmm1 > - movaps 0x06(%rsi), %xmm2 > - movaps 0x16(%rsi), %xmm3 > - movaps 0x26(%rsi), %xmm4 > - movaps 0x36(%rsi), %xmm5 > - movaps 0x46(%rsi), %xmm6 > - movaps 0x56(%rsi), %xmm7 > - movaps 0x66(%rsi), %xmm8 > - movaps 0x76(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $10, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $10, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $10, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $10, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $10, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $10, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $10, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_10) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_10_bwd): > - movaps -0x0a(%rsi), %xmm1 > - > - movaps -0x1a(%rsi), %xmm2 > - palignr $10, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2a(%rsi), %xmm3 > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3a(%rsi), %xmm4 > - palignr $10, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4a(%rsi), %xmm5 > - palignr $10, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5a(%rsi), %xmm6 > - palignr $10, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6a(%rsi), %xmm7 > - palignr $10, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7a(%rsi), %xmm8 > - palignr $10, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8a(%rsi), %xmm9 > - palignr $10, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_10_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - 
BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11): > - sub $0x80, %rdx > - movaps -0x0b(%rsi), %xmm1 > - movaps 0x05(%rsi), %xmm2 > - movaps 0x15(%rsi), %xmm3 > - movaps 0x25(%rsi), %xmm4 > - movaps 0x35(%rsi), %xmm5 > - movaps 0x45(%rsi), %xmm6 > - movaps 0x55(%rsi), %xmm7 > - movaps 0x65(%rsi), %xmm8 > - movaps 0x75(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $11, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $11, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $11, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $11, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $11, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $11, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $11, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_11) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_11_bwd): > - movaps -0x0b(%rsi), %xmm1 > - > - movaps -0x1b(%rsi), %xmm2 > - palignr $11, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2b(%rsi), %xmm3 > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3b(%rsi), %xmm4 > - palignr $11, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4b(%rsi), %xmm5 > - palignr $11, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5b(%rsi), %xmm6 > - palignr $11, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6b(%rsi), %xmm7 > - palignr $11, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7b(%rsi), %xmm8 > - palignr $11, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8b(%rsi), %xmm9 > - palignr $11, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_11_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12): > - sub $0x80, %rdx > - movdqa -0x0c(%rsi), %xmm1 > - movaps 0x04(%rsi), %xmm2 > - movaps 0x14(%rsi), %xmm3 > - movaps 0x24(%rsi), %xmm4 > - movaps 0x34(%rsi), %xmm5 > - movaps 0x44(%rsi), %xmm6 > - movaps 0x54(%rsi), %xmm7 > - movaps 0x64(%rsi), %xmm8 > - movaps 0x74(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $12, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $12, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $12, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $12, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $12, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $12, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $12, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - > - lea 0x80(%rdi), %rdi > - jae L(shl_12) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_12_bwd): > - movaps -0x0c(%rsi), %xmm1 > - > - movaps -0x1c(%rsi), %xmm2 > - palignr $12, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2c(%rsi), %xmm3 > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3c(%rsi), %xmm4 > - palignr $12, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4c(%rsi), %xmm5 > - palignr $12, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > 
- > - movaps -0x5c(%rsi), %xmm6 > - palignr $12, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6c(%rsi), %xmm7 > - palignr $12, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7c(%rsi), %xmm8 > - palignr $12, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8c(%rsi), %xmm9 > - palignr $12, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_12_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13): > - sub $0x80, %rdx > - movaps -0x0d(%rsi), %xmm1 > - movaps 0x03(%rsi), %xmm2 > - movaps 0x13(%rsi), %xmm3 > - movaps 0x23(%rsi), %xmm4 > - movaps 0x33(%rsi), %xmm5 > - movaps 0x43(%rsi), %xmm6 > - movaps 0x53(%rsi), %xmm7 > - movaps 0x63(%rsi), %xmm8 > - movaps 0x73(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $13, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $13, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $13, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $13, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $13, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $13, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $13, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_13) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_13_bwd): > - movaps -0x0d(%rsi), %xmm1 > - > - movaps -0x1d(%rsi), %xmm2 > - palignr $13, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2d(%rsi), %xmm3 > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3d(%rsi), %xmm4 > - palignr $13, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4d(%rsi), %xmm5 > - palignr $13, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5d(%rsi), %xmm6 > - palignr $13, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6d(%rsi), %xmm7 > - palignr $13, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7d(%rsi), %xmm8 > - palignr $13, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8d(%rsi), %xmm9 > - palignr $13, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_13_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14): > - sub $0x80, %rdx > - movaps -0x0e(%rsi), %xmm1 > - movaps 0x02(%rsi), %xmm2 > - movaps 0x12(%rsi), %xmm3 > - movaps 0x22(%rsi), %xmm4 > - movaps 0x32(%rsi), %xmm5 > - movaps 0x42(%rsi), %xmm6 > - movaps 0x52(%rsi), %xmm7 > - movaps 0x62(%rsi), %xmm8 > - movaps 0x72(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $14, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $14, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $14, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $14, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $14, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $14, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $14, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_14) > - movdqu %xmm0, (%r8) > - add 
$0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_14_bwd): > - movaps -0x0e(%rsi), %xmm1 > - > - movaps -0x1e(%rsi), %xmm2 > - palignr $14, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2e(%rsi), %xmm3 > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3e(%rsi), %xmm4 > - palignr $14, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4e(%rsi), %xmm5 > - palignr $14, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5e(%rsi), %xmm6 > - palignr $14, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6e(%rsi), %xmm7 > - palignr $14, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7e(%rsi), %xmm8 > - palignr $14, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8e(%rsi), %xmm9 > - palignr $14, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_14_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15): > - sub $0x80, %rdx > - movaps -0x0f(%rsi), %xmm1 > - movaps 0x01(%rsi), %xmm2 > - movaps 0x11(%rsi), %xmm3 > - movaps 0x21(%rsi), %xmm4 > - movaps 0x31(%rsi), %xmm5 > - movaps 0x41(%rsi), %xmm6 > - movaps 0x51(%rsi), %xmm7 > - movaps 0x61(%rsi), %xmm8 > - movaps 0x71(%rsi), %xmm9 > - lea 0x80(%rsi), %rsi > - palignr $15, %xmm8, %xmm9 > - movaps %xmm9, 0x70(%rdi) > - palignr $15, %xmm7, %xmm8 > - movaps %xmm8, 0x60(%rdi) > - palignr $15, %xmm6, %xmm7 > - movaps %xmm7, 0x50(%rdi) > - palignr $15, %xmm5, %xmm6 > - movaps %xmm6, 0x40(%rdi) > - palignr $15, %xmm4, %xmm5 > - movaps %xmm5, 0x30(%rdi) > - palignr $15, %xmm3, %xmm4 > - movaps %xmm4, 0x20(%rdi) > - palignr $15, %xmm2, %xmm3 > - movaps %xmm3, 0x10(%rdi) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdi) > - lea 0x80(%rdi), %rdi > - jae L(shl_15) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - add %rdx, %rdi > - add %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(shl_15_bwd): > - movaps -0x0f(%rsi), %xmm1 > - > - movaps -0x1f(%rsi), %xmm2 > - palignr $15, %xmm2, %xmm1 > - movaps %xmm1, -0x10(%rdi) > - > - movaps -0x2f(%rsi), %xmm3 > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, -0x20(%rdi) > - > - movaps -0x3f(%rsi), %xmm4 > - palignr $15, %xmm4, %xmm3 > - movaps %xmm3, -0x30(%rdi) > - > - movaps -0x4f(%rsi), %xmm5 > - palignr $15, %xmm5, %xmm4 > - movaps %xmm4, -0x40(%rdi) > - > - movaps -0x5f(%rsi), %xmm6 > - palignr $15, %xmm6, %xmm5 > - movaps %xmm5, -0x50(%rdi) > - > - movaps -0x6f(%rsi), %xmm7 > - palignr $15, %xmm7, %xmm6 > - movaps %xmm6, -0x60(%rdi) > - > - movaps -0x7f(%rsi), %xmm8 > - palignr $15, %xmm8, %xmm7 > - movaps %xmm7, -0x70(%rdi) > - > - movaps -0x8f(%rsi), %xmm9 > - palignr $15, %xmm9, %xmm8 > - movaps %xmm8, -0x80(%rdi) > - > - sub $0x80, %rdx > - lea -0x80(%rdi), %rdi > - lea -0x80(%rsi), %rsi > - jae L(shl_15_bwd) > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rdi > - sub %rdx, %rsi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_fwd): > - movdqu (%rsi), %xmm1 > - movdqu %xmm0, (%r8) > - movdqa %xmm1, (%rdi) > - sub $16, %rdx > - add $16, %rsi > - add $16, %rdi > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef 
USE_AS_MEMMOVE > - mov %rsi, %r9 > - sub %rdi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_fwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_fwd_start) > -L(memmove_is_memcpy_fwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger_in_fwd) > - mov %rdx, %rcx > -L(bigger_in_fwd): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy_fwd) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy_fwd) > - add %rcx, %rdx > - xor %rcx, %rcx > -L(2steps_copy_fwd): > - sub $0x80, %rdx > -L(gobble_mem_fwd_loop): > - sub $0x80, %rdx > - prefetcht0 0x200(%rsi) > - prefetcht0 0x300(%rsi) > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - lfence > - movntdq %xmm0, (%rdi) > - movntdq %xmm1, 0x10(%rdi) > - movntdq %xmm2, 0x20(%rdi) > - movntdq %xmm3, 0x30(%rdi) > - movntdq %xmm4, 0x40(%rdi) > - movntdq %xmm5, 0x50(%rdi) > - movntdq %xmm6, 0x60(%rdi) > - movntdq %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_mem_fwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_fwd_end) > - add $0x80, %rdx > -L(ll_cache_copy_fwd): > - add %rcx, %rdx > -L(ll_cache_copy_fwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop_fwd): > - prefetchnta 0x1c0(%rsi) > - prefetchnta 0x280(%rsi) > - prefetchnta 0x1c0(%rdi) > - prefetchnta 0x280(%rdi) > - sub $0x80, %rdx > - movdqu (%rsi), %xmm0 > - movdqu 0x10(%rsi), %xmm1 > - movdqu 0x20(%rsi), %xmm2 > - movdqu 0x30(%rsi), %xmm3 > - movdqu 0x40(%rsi), %xmm4 > - movdqu 0x50(%rsi), %xmm5 > - movdqu 0x60(%rsi), %xmm6 > - movdqu 0x70(%rsi), %xmm7 > - movdqa %xmm0, (%rdi) > - movdqa %xmm1, 0x10(%rdi) > - movdqa %xmm2, 0x20(%rdi) > - movdqa %xmm3, 0x30(%rdi) > - movdqa %xmm4, 0x40(%rdi) > - movdqa %xmm5, 0x50(%rdi) > - movdqa %xmm6, 0x60(%rdi) > - movdqa %xmm7, 0x70(%rdi) > - lea 0x80(%rsi), %rsi > - lea 0x80(%rdi), %rdi > - jae L(gobble_ll_loop_fwd) > -L(gobble_mem_fwd_end): > - add $0x80, %rdx > - add %rdx, %rsi > - add %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4) > - > - .p2align 4 > -L(gobble_mem_bwd): > - add %rdx, %rsi > - add %rdx, %rdi > - > - movdqu -16(%rsi), %xmm0 > - lea -16(%rdi), %r8 > - mov %rdi, %r9 > - and $-16, %rdi > - sub %rdi, %r9 > - sub %r9, %rsi > - sub %r9, %rdx > - > - > -#ifdef SHARED_CACHE_SIZE_HALF > - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP > -#else > - mov __x86_shared_cache_size_half(%rip), %RCX_LP > -#endif > -#ifdef USE_AS_MEMMOVE > - mov %rdi, %r9 > - sub %rsi, %r9 > - cmp %rdx, %r9 > - jae L(memmove_is_memcpy_bwd) > - cmp %rcx, %r9 > - jbe L(ll_cache_copy_bwd_start) > -L(memmove_is_memcpy_bwd): > -#endif > - cmp %rcx, %rdx > - ja L(bigger) > - mov %rdx, %rcx > -L(bigger): > - sub %rcx, %rdx > - cmp $0x1000, %rdx > - jbe L(ll_cache_copy) > - > - mov %rcx, %r9 > - shl $3, %r9 > - cmp %r9, %rdx > - jbe L(2steps_copy) > - add %rcx, %rdx > - xor %rcx, %rcx > -L(2steps_copy): > - sub $0x80, %rdx > -L(gobble_mem_bwd_loop): > - sub $0x80, %rdx > - prefetcht0 -0x200(%rsi) > - prefetcht0 -0x300(%rsi) > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - lfence > - movntdq %xmm1, -0x10(%rdi) > - movntdq %xmm2, -0x20(%rdi) > - movntdq %xmm3, -0x30(%rdi) > - movntdq %xmm4, -0x40(%rdi) > - movntdq %xmm5, 
-0x50(%rdi) > - movntdq %xmm6, -0x60(%rdi) > - movntdq %xmm7, -0x70(%rdi) > - movntdq %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_mem_bwd_loop) > - sfence > - cmp $0x80, %rcx > - jb L(gobble_mem_bwd_end) > - add $0x80, %rdx > -L(ll_cache_copy): > - add %rcx, %rdx > -L(ll_cache_copy_bwd_start): > - sub $0x80, %rdx > -L(gobble_ll_loop): > - prefetchnta -0x1c0(%rsi) > - prefetchnta -0x280(%rsi) > - prefetchnta -0x1c0(%rdi) > - prefetchnta -0x280(%rdi) > - sub $0x80, %rdx > - movdqu -0x10(%rsi), %xmm1 > - movdqu -0x20(%rsi), %xmm2 > - movdqu -0x30(%rsi), %xmm3 > - movdqu -0x40(%rsi), %xmm4 > - movdqu -0x50(%rsi), %xmm5 > - movdqu -0x60(%rsi), %xmm6 > - movdqu -0x70(%rsi), %xmm7 > - movdqu -0x80(%rsi), %xmm8 > - movdqa %xmm1, -0x10(%rdi) > - movdqa %xmm2, -0x20(%rdi) > - movdqa %xmm3, -0x30(%rdi) > - movdqa %xmm4, -0x40(%rdi) > - movdqa %xmm5, -0x50(%rdi) > - movdqa %xmm6, -0x60(%rdi) > - movdqa %xmm7, -0x70(%rdi) > - movdqa %xmm8, -0x80(%rdi) > - lea -0x80(%rsi), %rsi > - lea -0x80(%rdi), %rdi > - jae L(gobble_ll_loop) > -L(gobble_mem_bwd_end): > - movdqu %xmm0, (%r8) > - add $0x80, %rdx > - sub %rdx, %rsi > - sub %rdx, %rdi > - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4) > - > - .p2align 4 > -L(fwd_write_128bytes): > - lddqu -128(%rsi), %xmm0 > - movdqu %xmm0, -128(%rdi) > -L(fwd_write_112bytes): > - lddqu -112(%rsi), %xmm0 > - movdqu %xmm0, -112(%rdi) > -L(fwd_write_96bytes): > - lddqu -96(%rsi), %xmm0 > - movdqu %xmm0, -96(%rdi) > -L(fwd_write_80bytes): > - lddqu -80(%rsi), %xmm0 > - movdqu %xmm0, -80(%rdi) > -L(fwd_write_64bytes): > - lddqu -64(%rsi), %xmm0 > - movdqu %xmm0, -64(%rdi) > -L(fwd_write_48bytes): > - lddqu -48(%rsi), %xmm0 > - movdqu %xmm0, -48(%rdi) > -L(fwd_write_32bytes): > - lddqu -32(%rsi), %xmm0 > - movdqu %xmm0, -32(%rdi) > -L(fwd_write_16bytes): > - lddqu -16(%rsi), %xmm0 > - movdqu %xmm0, -16(%rdi) > -L(fwd_write_0bytes): > - ret > - > - > - .p2align 4 > -L(fwd_write_143bytes): > - lddqu -143(%rsi), %xmm0 > - movdqu %xmm0, -143(%rdi) > -L(fwd_write_127bytes): > - lddqu -127(%rsi), %xmm0 > - movdqu %xmm0, -127(%rdi) > -L(fwd_write_111bytes): > - lddqu -111(%rsi), %xmm0 > - movdqu %xmm0, -111(%rdi) > -L(fwd_write_95bytes): > - lddqu -95(%rsi), %xmm0 > - movdqu %xmm0, -95(%rdi) > -L(fwd_write_79bytes): > - lddqu -79(%rsi), %xmm0 > - movdqu %xmm0, -79(%rdi) > -L(fwd_write_63bytes): > - lddqu -63(%rsi), %xmm0 > - movdqu %xmm0, -63(%rdi) > -L(fwd_write_47bytes): > - lddqu -47(%rsi), %xmm0 > - movdqu %xmm0, -47(%rdi) > -L(fwd_write_31bytes): > - lddqu -31(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -31(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_15bytes): > - mov -15(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -15(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_142bytes): > - lddqu -142(%rsi), %xmm0 > - movdqu %xmm0, -142(%rdi) > -L(fwd_write_126bytes): > - lddqu -126(%rsi), %xmm0 > - movdqu %xmm0, -126(%rdi) > -L(fwd_write_110bytes): > - lddqu -110(%rsi), %xmm0 > - movdqu %xmm0, -110(%rdi) > -L(fwd_write_94bytes): > - lddqu -94(%rsi), %xmm0 > - movdqu %xmm0, -94(%rdi) > -L(fwd_write_78bytes): > - lddqu -78(%rsi), %xmm0 > - movdqu %xmm0, -78(%rdi) > -L(fwd_write_62bytes): > - lddqu -62(%rsi), %xmm0 > - movdqu %xmm0, -62(%rdi) > -L(fwd_write_46bytes): > - lddqu -46(%rsi), %xmm0 > - movdqu %xmm0, -46(%rdi) > -L(fwd_write_30bytes): > - lddqu -30(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -30(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > 
- > - .p2align 4 > -L(fwd_write_14bytes): > - mov -14(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -14(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_141bytes): > - lddqu -141(%rsi), %xmm0 > - movdqu %xmm0, -141(%rdi) > -L(fwd_write_125bytes): > - lddqu -125(%rsi), %xmm0 > - movdqu %xmm0, -125(%rdi) > -L(fwd_write_109bytes): > - lddqu -109(%rsi), %xmm0 > - movdqu %xmm0, -109(%rdi) > -L(fwd_write_93bytes): > - lddqu -93(%rsi), %xmm0 > - movdqu %xmm0, -93(%rdi) > -L(fwd_write_77bytes): > - lddqu -77(%rsi), %xmm0 > - movdqu %xmm0, -77(%rdi) > -L(fwd_write_61bytes): > - lddqu -61(%rsi), %xmm0 > - movdqu %xmm0, -61(%rdi) > -L(fwd_write_45bytes): > - lddqu -45(%rsi), %xmm0 > - movdqu %xmm0, -45(%rdi) > -L(fwd_write_29bytes): > - lddqu -29(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -29(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_13bytes): > - mov -13(%rsi), %rdx > - mov -8(%rsi), %rcx > - mov %rdx, -13(%rdi) > - mov %rcx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_140bytes): > - lddqu -140(%rsi), %xmm0 > - movdqu %xmm0, -140(%rdi) > -L(fwd_write_124bytes): > - lddqu -124(%rsi), %xmm0 > - movdqu %xmm0, -124(%rdi) > -L(fwd_write_108bytes): > - lddqu -108(%rsi), %xmm0 > - movdqu %xmm0, -108(%rdi) > -L(fwd_write_92bytes): > - lddqu -92(%rsi), %xmm0 > - movdqu %xmm0, -92(%rdi) > -L(fwd_write_76bytes): > - lddqu -76(%rsi), %xmm0 > - movdqu %xmm0, -76(%rdi) > -L(fwd_write_60bytes): > - lddqu -60(%rsi), %xmm0 > - movdqu %xmm0, -60(%rdi) > -L(fwd_write_44bytes): > - lddqu -44(%rsi), %xmm0 > - movdqu %xmm0, -44(%rdi) > -L(fwd_write_28bytes): > - lddqu -28(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -28(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_12bytes): > - mov -12(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -12(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_139bytes): > - lddqu -139(%rsi), %xmm0 > - movdqu %xmm0, -139(%rdi) > -L(fwd_write_123bytes): > - lddqu -123(%rsi), %xmm0 > - movdqu %xmm0, -123(%rdi) > -L(fwd_write_107bytes): > - lddqu -107(%rsi), %xmm0 > - movdqu %xmm0, -107(%rdi) > -L(fwd_write_91bytes): > - lddqu -91(%rsi), %xmm0 > - movdqu %xmm0, -91(%rdi) > -L(fwd_write_75bytes): > - lddqu -75(%rsi), %xmm0 > - movdqu %xmm0, -75(%rdi) > -L(fwd_write_59bytes): > - lddqu -59(%rsi), %xmm0 > - movdqu %xmm0, -59(%rdi) > -L(fwd_write_43bytes): > - lddqu -43(%rsi), %xmm0 > - movdqu %xmm0, -43(%rdi) > -L(fwd_write_27bytes): > - lddqu -27(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -27(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_11bytes): > - mov -11(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -11(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_138bytes): > - lddqu -138(%rsi), %xmm0 > - movdqu %xmm0, -138(%rdi) > -L(fwd_write_122bytes): > - lddqu -122(%rsi), %xmm0 > - movdqu %xmm0, -122(%rdi) > -L(fwd_write_106bytes): > - lddqu -106(%rsi), %xmm0 > - movdqu %xmm0, -106(%rdi) > -L(fwd_write_90bytes): > - lddqu -90(%rsi), %xmm0 > - movdqu %xmm0, -90(%rdi) > -L(fwd_write_74bytes): > - lddqu -74(%rsi), %xmm0 > - movdqu %xmm0, -74(%rdi) > -L(fwd_write_58bytes): > - lddqu -58(%rsi), %xmm0 > - movdqu %xmm0, -58(%rdi) > -L(fwd_write_42bytes): > - lddqu -42(%rsi), %xmm0 > - movdqu %xmm0, -42(%rdi) > -L(fwd_write_26bytes): > - lddqu -26(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -26(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_10bytes): > - 
mov -10(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -10(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_137bytes): > - lddqu -137(%rsi), %xmm0 > - movdqu %xmm0, -137(%rdi) > -L(fwd_write_121bytes): > - lddqu -121(%rsi), %xmm0 > - movdqu %xmm0, -121(%rdi) > -L(fwd_write_105bytes): > - lddqu -105(%rsi), %xmm0 > - movdqu %xmm0, -105(%rdi) > -L(fwd_write_89bytes): > - lddqu -89(%rsi), %xmm0 > - movdqu %xmm0, -89(%rdi) > -L(fwd_write_73bytes): > - lddqu -73(%rsi), %xmm0 > - movdqu %xmm0, -73(%rdi) > -L(fwd_write_57bytes): > - lddqu -57(%rsi), %xmm0 > - movdqu %xmm0, -57(%rdi) > -L(fwd_write_41bytes): > - lddqu -41(%rsi), %xmm0 > - movdqu %xmm0, -41(%rdi) > -L(fwd_write_25bytes): > - lddqu -25(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -25(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_9bytes): > - mov -9(%rsi), %rdx > - mov -4(%rsi), %ecx > - mov %rdx, -9(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_136bytes): > - lddqu -136(%rsi), %xmm0 > - movdqu %xmm0, -136(%rdi) > -L(fwd_write_120bytes): > - lddqu -120(%rsi), %xmm0 > - movdqu %xmm0, -120(%rdi) > -L(fwd_write_104bytes): > - lddqu -104(%rsi), %xmm0 > - movdqu %xmm0, -104(%rdi) > -L(fwd_write_88bytes): > - lddqu -88(%rsi), %xmm0 > - movdqu %xmm0, -88(%rdi) > -L(fwd_write_72bytes): > - lddqu -72(%rsi), %xmm0 > - movdqu %xmm0, -72(%rdi) > -L(fwd_write_56bytes): > - lddqu -56(%rsi), %xmm0 > - movdqu %xmm0, -56(%rdi) > -L(fwd_write_40bytes): > - lddqu -40(%rsi), %xmm0 > - movdqu %xmm0, -40(%rdi) > -L(fwd_write_24bytes): > - lddqu -24(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -24(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_8bytes): > - mov -8(%rsi), %rdx > - mov %rdx, -8(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_135bytes): > - lddqu -135(%rsi), %xmm0 > - movdqu %xmm0, -135(%rdi) > -L(fwd_write_119bytes): > - lddqu -119(%rsi), %xmm0 > - movdqu %xmm0, -119(%rdi) > -L(fwd_write_103bytes): > - lddqu -103(%rsi), %xmm0 > - movdqu %xmm0, -103(%rdi) > -L(fwd_write_87bytes): > - lddqu -87(%rsi), %xmm0 > - movdqu %xmm0, -87(%rdi) > -L(fwd_write_71bytes): > - lddqu -71(%rsi), %xmm0 > - movdqu %xmm0, -71(%rdi) > -L(fwd_write_55bytes): > - lddqu -55(%rsi), %xmm0 > - movdqu %xmm0, -55(%rdi) > -L(fwd_write_39bytes): > - lddqu -39(%rsi), %xmm0 > - movdqu %xmm0, -39(%rdi) > -L(fwd_write_23bytes): > - lddqu -23(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -23(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_7bytes): > - mov -7(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -7(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_134bytes): > - lddqu -134(%rsi), %xmm0 > - movdqu %xmm0, -134(%rdi) > -L(fwd_write_118bytes): > - lddqu -118(%rsi), %xmm0 > - movdqu %xmm0, -118(%rdi) > -L(fwd_write_102bytes): > - lddqu -102(%rsi), %xmm0 > - movdqu %xmm0, -102(%rdi) > -L(fwd_write_86bytes): > - lddqu -86(%rsi), %xmm0 > - movdqu %xmm0, -86(%rdi) > -L(fwd_write_70bytes): > - lddqu -70(%rsi), %xmm0 > - movdqu %xmm0, -70(%rdi) > -L(fwd_write_54bytes): > - lddqu -54(%rsi), %xmm0 > - movdqu %xmm0, -54(%rdi) > -L(fwd_write_38bytes): > - lddqu -38(%rsi), %xmm0 > - movdqu %xmm0, -38(%rdi) > -L(fwd_write_22bytes): > - lddqu -22(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -22(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_6bytes): > - mov -6(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -6(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > 
- .p2align 4 > -L(fwd_write_133bytes): > - lddqu -133(%rsi), %xmm0 > - movdqu %xmm0, -133(%rdi) > -L(fwd_write_117bytes): > - lddqu -117(%rsi), %xmm0 > - movdqu %xmm0, -117(%rdi) > -L(fwd_write_101bytes): > - lddqu -101(%rsi), %xmm0 > - movdqu %xmm0, -101(%rdi) > -L(fwd_write_85bytes): > - lddqu -85(%rsi), %xmm0 > - movdqu %xmm0, -85(%rdi) > -L(fwd_write_69bytes): > - lddqu -69(%rsi), %xmm0 > - movdqu %xmm0, -69(%rdi) > -L(fwd_write_53bytes): > - lddqu -53(%rsi), %xmm0 > - movdqu %xmm0, -53(%rdi) > -L(fwd_write_37bytes): > - lddqu -37(%rsi), %xmm0 > - movdqu %xmm0, -37(%rdi) > -L(fwd_write_21bytes): > - lddqu -21(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -21(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_5bytes): > - mov -5(%rsi), %edx > - mov -4(%rsi), %ecx > - mov %edx, -5(%rdi) > - mov %ecx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_132bytes): > - lddqu -132(%rsi), %xmm0 > - movdqu %xmm0, -132(%rdi) > -L(fwd_write_116bytes): > - lddqu -116(%rsi), %xmm0 > - movdqu %xmm0, -116(%rdi) > -L(fwd_write_100bytes): > - lddqu -100(%rsi), %xmm0 > - movdqu %xmm0, -100(%rdi) > -L(fwd_write_84bytes): > - lddqu -84(%rsi), %xmm0 > - movdqu %xmm0, -84(%rdi) > -L(fwd_write_68bytes): > - lddqu -68(%rsi), %xmm0 > - movdqu %xmm0, -68(%rdi) > -L(fwd_write_52bytes): > - lddqu -52(%rsi), %xmm0 > - movdqu %xmm0, -52(%rdi) > -L(fwd_write_36bytes): > - lddqu -36(%rsi), %xmm0 > - movdqu %xmm0, -36(%rdi) > -L(fwd_write_20bytes): > - lddqu -20(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -20(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_4bytes): > - mov -4(%rsi), %edx > - mov %edx, -4(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_131bytes): > - lddqu -131(%rsi), %xmm0 > - movdqu %xmm0, -131(%rdi) > -L(fwd_write_115bytes): > - lddqu -115(%rsi), %xmm0 > - movdqu %xmm0, -115(%rdi) > -L(fwd_write_99bytes): > - lddqu -99(%rsi), %xmm0 > - movdqu %xmm0, -99(%rdi) > -L(fwd_write_83bytes): > - lddqu -83(%rsi), %xmm0 > - movdqu %xmm0, -83(%rdi) > -L(fwd_write_67bytes): > - lddqu -67(%rsi), %xmm0 > - movdqu %xmm0, -67(%rdi) > -L(fwd_write_51bytes): > - lddqu -51(%rsi), %xmm0 > - movdqu %xmm0, -51(%rdi) > -L(fwd_write_35bytes): > - lddqu -35(%rsi), %xmm0 > - movdqu %xmm0, -35(%rdi) > -L(fwd_write_19bytes): > - lddqu -19(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -19(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_3bytes): > - mov -3(%rsi), %dx > - mov -2(%rsi), %cx > - mov %dx, -3(%rdi) > - mov %cx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_130bytes): > - lddqu -130(%rsi), %xmm0 > - movdqu %xmm0, -130(%rdi) > -L(fwd_write_114bytes): > - lddqu -114(%rsi), %xmm0 > - movdqu %xmm0, -114(%rdi) > -L(fwd_write_98bytes): > - lddqu -98(%rsi), %xmm0 > - movdqu %xmm0, -98(%rdi) > -L(fwd_write_82bytes): > - lddqu -82(%rsi), %xmm0 > - movdqu %xmm0, -82(%rdi) > -L(fwd_write_66bytes): > - lddqu -66(%rsi), %xmm0 > - movdqu %xmm0, -66(%rdi) > -L(fwd_write_50bytes): > - lddqu -50(%rsi), %xmm0 > - movdqu %xmm0, -50(%rdi) > -L(fwd_write_34bytes): > - lddqu -34(%rsi), %xmm0 > - movdqu %xmm0, -34(%rdi) > -L(fwd_write_18bytes): > - lddqu -18(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -18(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_2bytes): > - movzwl -2(%rsi), %edx > - mov %dx, -2(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_129bytes): > - lddqu -129(%rsi), %xmm0 > - movdqu %xmm0, -129(%rdi) > -L(fwd_write_113bytes): > - lddqu -113(%rsi), %xmm0 > - 
movdqu %xmm0, -113(%rdi) > -L(fwd_write_97bytes): > - lddqu -97(%rsi), %xmm0 > - movdqu %xmm0, -97(%rdi) > -L(fwd_write_81bytes): > - lddqu -81(%rsi), %xmm0 > - movdqu %xmm0, -81(%rdi) > -L(fwd_write_65bytes): > - lddqu -65(%rsi), %xmm0 > - movdqu %xmm0, -65(%rdi) > -L(fwd_write_49bytes): > - lddqu -49(%rsi), %xmm0 > - movdqu %xmm0, -49(%rdi) > -L(fwd_write_33bytes): > - lddqu -33(%rsi), %xmm0 > - movdqu %xmm0, -33(%rdi) > -L(fwd_write_17bytes): > - lddqu -17(%rsi), %xmm0 > - lddqu -16(%rsi), %xmm1 > - movdqu %xmm0, -17(%rdi) > - movdqu %xmm1, -16(%rdi) > - ret > - > - .p2align 4 > -L(fwd_write_1bytes): > - movzbl -1(%rsi), %edx > - mov %dl, -1(%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_128bytes): > - lddqu 112(%rsi), %xmm0 > - movdqu %xmm0, 112(%rdi) > -L(bwd_write_112bytes): > - lddqu 96(%rsi), %xmm0 > - movdqu %xmm0, 96(%rdi) > -L(bwd_write_96bytes): > - lddqu 80(%rsi), %xmm0 > - movdqu %xmm0, 80(%rdi) > -L(bwd_write_80bytes): > - lddqu 64(%rsi), %xmm0 > - movdqu %xmm0, 64(%rdi) > -L(bwd_write_64bytes): > - lddqu 48(%rsi), %xmm0 > - movdqu %xmm0, 48(%rdi) > -L(bwd_write_48bytes): > - lddqu 32(%rsi), %xmm0 > - movdqu %xmm0, 32(%rdi) > -L(bwd_write_32bytes): > - lddqu 16(%rsi), %xmm0 > - movdqu %xmm0, 16(%rdi) > -L(bwd_write_16bytes): > - lddqu (%rsi), %xmm0 > - movdqu %xmm0, (%rdi) > -L(bwd_write_0bytes): > - ret > - > - .p2align 4 > -L(bwd_write_143bytes): > - lddqu 127(%rsi), %xmm0 > - movdqu %xmm0, 127(%rdi) > -L(bwd_write_127bytes): > - lddqu 111(%rsi), %xmm0 > - movdqu %xmm0, 111(%rdi) > -L(bwd_write_111bytes): > - lddqu 95(%rsi), %xmm0 > - movdqu %xmm0, 95(%rdi) > -L(bwd_write_95bytes): > - lddqu 79(%rsi), %xmm0 > - movdqu %xmm0, 79(%rdi) > -L(bwd_write_79bytes): > - lddqu 63(%rsi), %xmm0 > - movdqu %xmm0, 63(%rdi) > -L(bwd_write_63bytes): > - lddqu 47(%rsi), %xmm0 > - movdqu %xmm0, 47(%rdi) > -L(bwd_write_47bytes): > - lddqu 31(%rsi), %xmm0 > - movdqu %xmm0, 31(%rdi) > -L(bwd_write_31bytes): > - lddqu 15(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 15(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - > - .p2align 4 > -L(bwd_write_15bytes): > - mov 7(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 7(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_142bytes): > - lddqu 126(%rsi), %xmm0 > - movdqu %xmm0, 126(%rdi) > -L(bwd_write_126bytes): > - lddqu 110(%rsi), %xmm0 > - movdqu %xmm0, 110(%rdi) > -L(bwd_write_110bytes): > - lddqu 94(%rsi), %xmm0 > - movdqu %xmm0, 94(%rdi) > -L(bwd_write_94bytes): > - lddqu 78(%rsi), %xmm0 > - movdqu %xmm0, 78(%rdi) > -L(bwd_write_78bytes): > - lddqu 62(%rsi), %xmm0 > - movdqu %xmm0, 62(%rdi) > -L(bwd_write_62bytes): > - lddqu 46(%rsi), %xmm0 > - movdqu %xmm0, 46(%rdi) > -L(bwd_write_46bytes): > - lddqu 30(%rsi), %xmm0 > - movdqu %xmm0, 30(%rdi) > -L(bwd_write_30bytes): > - lddqu 14(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 14(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_14bytes): > - mov 6(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 6(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_141bytes): > - lddqu 125(%rsi), %xmm0 > - movdqu %xmm0, 125(%rdi) > -L(bwd_write_125bytes): > - lddqu 109(%rsi), %xmm0 > - movdqu %xmm0, 109(%rdi) > -L(bwd_write_109bytes): > - lddqu 93(%rsi), %xmm0 > - movdqu %xmm0, 93(%rdi) > -L(bwd_write_93bytes): > - lddqu 77(%rsi), %xmm0 > - movdqu %xmm0, 77(%rdi) > -L(bwd_write_77bytes): > - lddqu 61(%rsi), %xmm0 > - movdqu %xmm0, 61(%rdi) > -L(bwd_write_61bytes): > - lddqu 45(%rsi), %xmm0 > - movdqu %xmm0, 45(%rdi) > -L(bwd_write_45bytes): 
> - lddqu 29(%rsi), %xmm0 > - movdqu %xmm0, 29(%rdi) > -L(bwd_write_29bytes): > - lddqu 13(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 13(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_13bytes): > - mov 5(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 5(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_140bytes): > - lddqu 124(%rsi), %xmm0 > - movdqu %xmm0, 124(%rdi) > -L(bwd_write_124bytes): > - lddqu 108(%rsi), %xmm0 > - movdqu %xmm0, 108(%rdi) > -L(bwd_write_108bytes): > - lddqu 92(%rsi), %xmm0 > - movdqu %xmm0, 92(%rdi) > -L(bwd_write_92bytes): > - lddqu 76(%rsi), %xmm0 > - movdqu %xmm0, 76(%rdi) > -L(bwd_write_76bytes): > - lddqu 60(%rsi), %xmm0 > - movdqu %xmm0, 60(%rdi) > -L(bwd_write_60bytes): > - lddqu 44(%rsi), %xmm0 > - movdqu %xmm0, 44(%rdi) > -L(bwd_write_44bytes): > - lddqu 28(%rsi), %xmm0 > - movdqu %xmm0, 28(%rdi) > -L(bwd_write_28bytes): > - lddqu 12(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 12(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_12bytes): > - mov 4(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 4(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_139bytes): > - lddqu 123(%rsi), %xmm0 > - movdqu %xmm0, 123(%rdi) > -L(bwd_write_123bytes): > - lddqu 107(%rsi), %xmm0 > - movdqu %xmm0, 107(%rdi) > -L(bwd_write_107bytes): > - lddqu 91(%rsi), %xmm0 > - movdqu %xmm0, 91(%rdi) > -L(bwd_write_91bytes): > - lddqu 75(%rsi), %xmm0 > - movdqu %xmm0, 75(%rdi) > -L(bwd_write_75bytes): > - lddqu 59(%rsi), %xmm0 > - movdqu %xmm0, 59(%rdi) > -L(bwd_write_59bytes): > - lddqu 43(%rsi), %xmm0 > - movdqu %xmm0, 43(%rdi) > -L(bwd_write_43bytes): > - lddqu 27(%rsi), %xmm0 > - movdqu %xmm0, 27(%rdi) > -L(bwd_write_27bytes): > - lddqu 11(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 11(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_11bytes): > - mov 3(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 3(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_138bytes): > - lddqu 122(%rsi), %xmm0 > - movdqu %xmm0, 122(%rdi) > -L(bwd_write_122bytes): > - lddqu 106(%rsi), %xmm0 > - movdqu %xmm0, 106(%rdi) > -L(bwd_write_106bytes): > - lddqu 90(%rsi), %xmm0 > - movdqu %xmm0, 90(%rdi) > -L(bwd_write_90bytes): > - lddqu 74(%rsi), %xmm0 > - movdqu %xmm0, 74(%rdi) > -L(bwd_write_74bytes): > - lddqu 58(%rsi), %xmm0 > - movdqu %xmm0, 58(%rdi) > -L(bwd_write_58bytes): > - lddqu 42(%rsi), %xmm0 > - movdqu %xmm0, 42(%rdi) > -L(bwd_write_42bytes): > - lddqu 26(%rsi), %xmm0 > - movdqu %xmm0, 26(%rdi) > -L(bwd_write_26bytes): > - lddqu 10(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 10(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_10bytes): > - mov 2(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 2(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_137bytes): > - lddqu 121(%rsi), %xmm0 > - movdqu %xmm0, 121(%rdi) > -L(bwd_write_121bytes): > - lddqu 105(%rsi), %xmm0 > - movdqu %xmm0, 105(%rdi) > -L(bwd_write_105bytes): > - lddqu 89(%rsi), %xmm0 > - movdqu %xmm0, 89(%rdi) > -L(bwd_write_89bytes): > - lddqu 73(%rsi), %xmm0 > - movdqu %xmm0, 73(%rdi) > -L(bwd_write_73bytes): > - lddqu 57(%rsi), %xmm0 > - movdqu %xmm0, 57(%rdi) > -L(bwd_write_57bytes): > - lddqu 41(%rsi), %xmm0 > - movdqu %xmm0, 41(%rdi) > -L(bwd_write_41bytes): > - lddqu 25(%rsi), %xmm0 > - movdqu %xmm0, 25(%rdi) > -L(bwd_write_25bytes): > - lddqu 9(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 9(%rdi) > - movdqu %xmm1, 
(%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_9bytes): > - mov 1(%rsi), %rdx > - mov (%rsi), %rcx > - mov %rdx, 1(%rdi) > - mov %rcx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_136bytes): > - lddqu 120(%rsi), %xmm0 > - movdqu %xmm0, 120(%rdi) > -L(bwd_write_120bytes): > - lddqu 104(%rsi), %xmm0 > - movdqu %xmm0, 104(%rdi) > -L(bwd_write_104bytes): > - lddqu 88(%rsi), %xmm0 > - movdqu %xmm0, 88(%rdi) > -L(bwd_write_88bytes): > - lddqu 72(%rsi), %xmm0 > - movdqu %xmm0, 72(%rdi) > -L(bwd_write_72bytes): > - lddqu 56(%rsi), %xmm0 > - movdqu %xmm0, 56(%rdi) > -L(bwd_write_56bytes): > - lddqu 40(%rsi), %xmm0 > - movdqu %xmm0, 40(%rdi) > -L(bwd_write_40bytes): > - lddqu 24(%rsi), %xmm0 > - movdqu %xmm0, 24(%rdi) > -L(bwd_write_24bytes): > - lddqu 8(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 8(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_8bytes): > - mov (%rsi), %rdx > - mov %rdx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_135bytes): > - lddqu 119(%rsi), %xmm0 > - movdqu %xmm0, 119(%rdi) > -L(bwd_write_119bytes): > - lddqu 103(%rsi), %xmm0 > - movdqu %xmm0, 103(%rdi) > -L(bwd_write_103bytes): > - lddqu 87(%rsi), %xmm0 > - movdqu %xmm0, 87(%rdi) > -L(bwd_write_87bytes): > - lddqu 71(%rsi), %xmm0 > - movdqu %xmm0, 71(%rdi) > -L(bwd_write_71bytes): > - lddqu 55(%rsi), %xmm0 > - movdqu %xmm0, 55(%rdi) > -L(bwd_write_55bytes): > - lddqu 39(%rsi), %xmm0 > - movdqu %xmm0, 39(%rdi) > -L(bwd_write_39bytes): > - lddqu 23(%rsi), %xmm0 > - movdqu %xmm0, 23(%rdi) > -L(bwd_write_23bytes): > - lddqu 7(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 7(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_7bytes): > - mov 3(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 3(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_134bytes): > - lddqu 118(%rsi), %xmm0 > - movdqu %xmm0, 118(%rdi) > -L(bwd_write_118bytes): > - lddqu 102(%rsi), %xmm0 > - movdqu %xmm0, 102(%rdi) > -L(bwd_write_102bytes): > - lddqu 86(%rsi), %xmm0 > - movdqu %xmm0, 86(%rdi) > -L(bwd_write_86bytes): > - lddqu 70(%rsi), %xmm0 > - movdqu %xmm0, 70(%rdi) > -L(bwd_write_70bytes): > - lddqu 54(%rsi), %xmm0 > - movdqu %xmm0, 54(%rdi) > -L(bwd_write_54bytes): > - lddqu 38(%rsi), %xmm0 > - movdqu %xmm0, 38(%rdi) > -L(bwd_write_38bytes): > - lddqu 22(%rsi), %xmm0 > - movdqu %xmm0, 22(%rdi) > -L(bwd_write_22bytes): > - lddqu 6(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 6(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_6bytes): > - mov 2(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 2(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_133bytes): > - lddqu 117(%rsi), %xmm0 > - movdqu %xmm0, 117(%rdi) > -L(bwd_write_117bytes): > - lddqu 101(%rsi), %xmm0 > - movdqu %xmm0, 101(%rdi) > -L(bwd_write_101bytes): > - lddqu 85(%rsi), %xmm0 > - movdqu %xmm0, 85(%rdi) > -L(bwd_write_85bytes): > - lddqu 69(%rsi), %xmm0 > - movdqu %xmm0, 69(%rdi) > -L(bwd_write_69bytes): > - lddqu 53(%rsi), %xmm0 > - movdqu %xmm0, 53(%rdi) > -L(bwd_write_53bytes): > - lddqu 37(%rsi), %xmm0 > - movdqu %xmm0, 37(%rdi) > -L(bwd_write_37bytes): > - lddqu 21(%rsi), %xmm0 > - movdqu %xmm0, 21(%rdi) > -L(bwd_write_21bytes): > - lddqu 5(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 5(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_5bytes): > - mov 1(%rsi), %edx > - mov (%rsi), %ecx > - mov %edx, 1(%rdi) > - mov %ecx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_132bytes): > - lddqu 116(%rsi), %xmm0 > - 
movdqu %xmm0, 116(%rdi) > -L(bwd_write_116bytes): > - lddqu 100(%rsi), %xmm0 > - movdqu %xmm0, 100(%rdi) > -L(bwd_write_100bytes): > - lddqu 84(%rsi), %xmm0 > - movdqu %xmm0, 84(%rdi) > -L(bwd_write_84bytes): > - lddqu 68(%rsi), %xmm0 > - movdqu %xmm0, 68(%rdi) > -L(bwd_write_68bytes): > - lddqu 52(%rsi), %xmm0 > - movdqu %xmm0, 52(%rdi) > -L(bwd_write_52bytes): > - lddqu 36(%rsi), %xmm0 > - movdqu %xmm0, 36(%rdi) > -L(bwd_write_36bytes): > - lddqu 20(%rsi), %xmm0 > - movdqu %xmm0, 20(%rdi) > -L(bwd_write_20bytes): > - lddqu 4(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 4(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_4bytes): > - mov (%rsi), %edx > - mov %edx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_131bytes): > - lddqu 115(%rsi), %xmm0 > - movdqu %xmm0, 115(%rdi) > -L(bwd_write_115bytes): > - lddqu 99(%rsi), %xmm0 > - movdqu %xmm0, 99(%rdi) > -L(bwd_write_99bytes): > - lddqu 83(%rsi), %xmm0 > - movdqu %xmm0, 83(%rdi) > -L(bwd_write_83bytes): > - lddqu 67(%rsi), %xmm0 > - movdqu %xmm0, 67(%rdi) > -L(bwd_write_67bytes): > - lddqu 51(%rsi), %xmm0 > - movdqu %xmm0, 51(%rdi) > -L(bwd_write_51bytes): > - lddqu 35(%rsi), %xmm0 > - movdqu %xmm0, 35(%rdi) > -L(bwd_write_35bytes): > - lddqu 19(%rsi), %xmm0 > - movdqu %xmm0, 19(%rdi) > -L(bwd_write_19bytes): > - lddqu 3(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 3(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_3bytes): > - mov 1(%rsi), %dx > - mov (%rsi), %cx > - mov %dx, 1(%rdi) > - mov %cx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_130bytes): > - lddqu 114(%rsi), %xmm0 > - movdqu %xmm0, 114(%rdi) > -L(bwd_write_114bytes): > - lddqu 98(%rsi), %xmm0 > - movdqu %xmm0, 98(%rdi) > -L(bwd_write_98bytes): > - lddqu 82(%rsi), %xmm0 > - movdqu %xmm0, 82(%rdi) > -L(bwd_write_82bytes): > - lddqu 66(%rsi), %xmm0 > - movdqu %xmm0, 66(%rdi) > -L(bwd_write_66bytes): > - lddqu 50(%rsi), %xmm0 > - movdqu %xmm0, 50(%rdi) > -L(bwd_write_50bytes): > - lddqu 34(%rsi), %xmm0 > - movdqu %xmm0, 34(%rdi) > -L(bwd_write_34bytes): > - lddqu 18(%rsi), %xmm0 > - movdqu %xmm0, 18(%rdi) > -L(bwd_write_18bytes): > - lddqu 2(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 2(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_2bytes): > - movzwl (%rsi), %edx > - mov %dx, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_129bytes): > - lddqu 113(%rsi), %xmm0 > - movdqu %xmm0, 113(%rdi) > -L(bwd_write_113bytes): > - lddqu 97(%rsi), %xmm0 > - movdqu %xmm0, 97(%rdi) > -L(bwd_write_97bytes): > - lddqu 81(%rsi), %xmm0 > - movdqu %xmm0, 81(%rdi) > -L(bwd_write_81bytes): > - lddqu 65(%rsi), %xmm0 > - movdqu %xmm0, 65(%rdi) > -L(bwd_write_65bytes): > - lddqu 49(%rsi), %xmm0 > - movdqu %xmm0, 49(%rdi) > -L(bwd_write_49bytes): > - lddqu 33(%rsi), %xmm0 > - movdqu %xmm0, 33(%rdi) > -L(bwd_write_33bytes): > - lddqu 17(%rsi), %xmm0 > - movdqu %xmm0, 17(%rdi) > -L(bwd_write_17bytes): > - lddqu 1(%rsi), %xmm0 > - lddqu (%rsi), %xmm1 > - movdqu %xmm0, 1(%rdi) > - movdqu %xmm1, (%rdi) > - ret > - > - .p2align 4 > -L(bwd_write_1bytes): > - movzbl (%rsi), %edx > - mov %dl, (%rdi) > - ret > - > -END (MEMCPY) > - > - .section .rodata.ssse3,"a",@progbits > - .p2align 3 > -L(table_144_bytes_bwd): > - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_4bytes), 
L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd)) > - 
.int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd)) > - .int 
JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd)) > - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd)) > - > - .p2align 3 > -L(table_144_bytes_fwd): > - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_28bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_84bytes), 
L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd)) > - .int JMPTBL 
(L(fwd_write_140bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd)) > - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd)) > - > - .p2align 3 > -L(shl_table_fwd): > - .int JMPTBL (L(shl_0), L(shl_table_fwd)) > - .int JMPTBL (L(shl_1), L(shl_table_fwd)) > - .int JMPTBL (L(shl_2), L(shl_table_fwd)) > - .int JMPTBL (L(shl_3), L(shl_table_fwd)) > - .int JMPTBL (L(shl_4), L(shl_table_fwd)) > - .int JMPTBL (L(shl_5), L(shl_table_fwd)) > - .int JMPTBL (L(shl_6), L(shl_table_fwd)) > - .int JMPTBL (L(shl_7), L(shl_table_fwd)) > - .int JMPTBL (L(shl_8), L(shl_table_fwd)) > - .int JMPTBL (L(shl_9), L(shl_table_fwd)) > - .int JMPTBL (L(shl_10), L(shl_table_fwd)) > - .int JMPTBL (L(shl_11), L(shl_table_fwd)) > - .int JMPTBL (L(shl_12), L(shl_table_fwd)) > - .int JMPTBL (L(shl_13), L(shl_table_fwd)) > - .int JMPTBL (L(shl_14), L(shl_table_fwd)) > - .int JMPTBL (L(shl_15), L(shl_table_fwd)) > - > - .p2align 3 > -L(shl_table_bwd): > - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd)) > - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd)) > - > -#endif > diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > deleted file mode 100644 > index f9a4e9aff9..0000000000 > --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_MEMMOVE > -#define MEMCPY __memmove_ssse3_back > -#define MEMCPY_CHK __memmove_chk_ssse3_back > -#include "memcpy-ssse3-back.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
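A note on the jump tables in the file removed above: each ".int JMPTBL (L(target), L(table))" entry stores a label's 32-bit offset relative to the table itself, so the dispatch code can load table[len], add it back to the table's address, and jump while staying position-independent. Each landing point stores one chunk and falls through the entries for the smaller sizes, the way L(fwd_write_137bytes) falls into L(fwd_write_121bytes) and so on. A minimal C analogue of that dispatch-and-fall-through shape (a hypothetical sketch using GCC's labels-as-values extension, with made-up names; the real tables hold relative offsets rather than absolute label addresses):

#include <stddef.h>

/* Jump once on the length, then fall through the remaining stores.  */
static void
copy_upto_4 (char *dst, const char *src, size_t len)
{
  /* GCC extension: a static table of label addresses.  The removed
     tables store "label - table" offsets instead, for PIC.  */
  static const void *const entry[] = { &&w0, &&w1, &&w2, &&w3, &&w4 };
  goto *entry[len];   /* caller guarantees len <= 4 */
 w4: dst[3] = src[3];
 w3: dst[2] = src[2];
 w2: dst[1] = src[1];
 w1: dst[0] = src[0];
 w0: return;
}

The L(table_144_bytes_bwd) entries mirror this for copies that must run in the opposite direction when the buffers overlap.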
* [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (2 preceding siblings ...) 2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein @ 2022-03-25 18:36 ` Noah Goldstein 2022-03-25 19:57 ` H.J. Lu 2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein ` (2 subsequent siblings) 6 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer SSSE3. As a result it's no longer worth the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 2 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 - sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 --------------------- sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 - 5 files changed, 879 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index 323be3b969..a2ebc06c5f 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -59,7 +59,6 @@ sysdep_routines += \ strcat-evex \ strcat-sse2 \ strcat-sse2-unaligned \ - strcat-ssse3 \ strchr-avx2 \ strchr-avx2-rtm \ strchr-evex \ @@ -97,7 +96,6 @@ sysdep_routines += \ strncat-c \ strncat-evex \ strncat-sse2-unaligned \ - strncat-ssse3 \ strncmp-avx2 \ strncmp-avx2-rtm \ strncmp-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index d6852ab365..4133ed7e43 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcat_evex) - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), - __strcat_ssse3) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) @@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncat_evex) - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), - __strncat_ssse3) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h index 5bece38f78..a15afa44e9 100644 --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h @@ -23,7 +23,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) return OPTIMIZE (sse2_unaligned); - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) - return OPTIMIZE (ssse3); - return OPTIMIZE (sse2); } diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S deleted file
mode 100644 index 9f39e4fcd1..0000000000 --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S +++ /dev/null @@ -1,866 +0,0 @@ -/* strcat with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. */ - -#if IS_IN (libc) - -# include <sysdep.h> - -# ifndef STRCAT -# define STRCAT __strcat_ssse3 -# endif - -# define USE_AS_STRCAT - -.text -ENTRY (STRCAT) -# ifdef USE_AS_STRNCAT - mov %rdx, %r8 -# endif - - -/* Inline corresponding strlen file, temporary until new strcpy - implementation gets merged. */ - - xor %eax, %eax - cmpb $0, (%rdi) - jz L(exit_tail0) - cmpb $0, 1(%rdi) - jz L(exit_tail1) - cmpb $0, 2(%rdi) - jz L(exit_tail2) - cmpb $0, 3(%rdi) - jz L(exit_tail3) - - cmpb $0, 4(%rdi) - jz L(exit_tail4) - cmpb $0, 5(%rdi) - jz L(exit_tail5) - cmpb $0, 6(%rdi) - jz L(exit_tail6) - cmpb $0, 7(%rdi) - jz L(exit_tail7) - - cmpb $0, 8(%rdi) - jz L(exit_tail8) - cmpb $0, 9(%rdi) - jz L(exit_tail9) - cmpb $0, 10(%rdi) - jz L(exit_tail10) - cmpb $0, 11(%rdi) - jz L(exit_tail11) - - cmpb $0, 12(%rdi) - jz L(exit_tail12) - cmpb $0, 13(%rdi) - jz L(exit_tail13) - cmpb $0, 14(%rdi) - jz L(exit_tail14) - cmpb $0, 15(%rdi) - jz L(exit_tail15) - pxor %xmm0, %xmm0 - lea 16(%rdi), %rcx - lea 16(%rdi), %rax - and $-16, %rax - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - pxor %xmm1, %xmm1 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - pxor %xmm2, %xmm2 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - pxor %xmm3, %xmm3 - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm0 - pmovmskb %xmm0, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm1 - pmovmskb %xmm1, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb (%rax), %xmm2 - pmovmskb %xmm2, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - pcmpeqb 
(%rax), %xmm3 - pmovmskb %xmm3, %edx - test %edx, %edx - lea 16(%rax), %rax - jnz L(exit) - - and $-0x40, %rax - - .p2align 4 -L(aligned_64): - pcmpeqb (%rax), %xmm0 - pcmpeqb 16(%rax), %xmm1 - pcmpeqb 32(%rax), %xmm2 - pcmpeqb 48(%rax), %xmm3 - pmovmskb %xmm0, %edx - pmovmskb %xmm1, %r11d - pmovmskb %xmm2, %r10d - pmovmskb %xmm3, %r9d - or %edx, %r9d - or %r11d, %r9d - or %r10d, %r9d - lea 64(%rax), %rax - jz L(aligned_64) - - test %edx, %edx - jnz L(aligned_64_exit_16) - test %r11d, %r11d - jnz L(aligned_64_exit_32) - test %r10d, %r10d - jnz L(aligned_64_exit_48) - -L(aligned_64_exit_64): - pmovmskb %xmm3, %edx - jmp L(exit) - -L(aligned_64_exit_48): - lea -16(%rax), %rax - mov %r10d, %edx - jmp L(exit) - -L(aligned_64_exit_32): - lea -32(%rax), %rax - mov %r11d, %edx - jmp L(exit) - -L(aligned_64_exit_16): - lea -48(%rax), %rax - -L(exit): - sub %rcx, %rax - test %dl, %dl - jz L(exit_high) - test $0x01, %dl - jnz L(exit_tail0) - - test $0x02, %dl - jnz L(exit_tail1) - - test $0x04, %dl - jnz L(exit_tail2) - - test $0x08, %dl - jnz L(exit_tail3) - - test $0x10, %dl - jnz L(exit_tail4) - - test $0x20, %dl - jnz L(exit_tail5) - - test $0x40, %dl - jnz L(exit_tail6) - add $7, %eax -L(exit_tail0): - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_high): - add $8, %eax - test $0x01, %dh - jnz L(exit_tail0) - - test $0x02, %dh - jnz L(exit_tail1) - - test $0x04, %dh - jnz L(exit_tail2) - - test $0x08, %dh - jnz L(exit_tail3) - - test $0x10, %dh - jnz L(exit_tail4) - - test $0x20, %dh - jnz L(exit_tail5) - - test $0x40, %dh - jnz L(exit_tail6) - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail1): - add $1, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail2): - add $2, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail3): - add $3, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail4): - add $4, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail5): - add $5, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail6): - add $6, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail7): - add $7, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail8): - add $8, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail9): - add $9, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail10): - add $10, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail11): - add $11, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail12): - add $12, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail13): - add $13, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail14): - add $14, %eax - jmp L(StartStrcpyPart) - - .p2align 4 -L(exit_tail15): - add $15, %eax - - .p2align 4 -L(StartStrcpyPart): - mov %rsi, %rcx - lea (%rdi, %rax), %rdx -# ifdef USE_AS_STRNCAT - test %r8, %r8 - jz L(StrncatExit0) - cmp $8, %r8 - jbe L(StrncatExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) - cmpb $0, 8(%rcx) - jz L(Exit9) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - jb L(StrncatExit15Bytes) -# endif - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) - cmpb $0, 15(%rcx) - jz L(Exit16) -# ifdef USE_AS_STRNCAT - cmp $16, %r8 - je L(StrncatExit16) -# define USE_AS_STRNCPY -# endif 
- -# include "strcpy-ssse3.S" - - .p2align 4 -L(CopyFrom1To16Bytes): - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit1): - xor %ah, %ah - movb %ah, 1(%rdx) -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit2): - xor %ah, %ah - movb %ah, 2(%rdx) -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit3): - xor %ah, %ah - movb %ah, 3(%rdx) -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit4): - xor %ah, %ah - movb %ah, 4(%rdx) -L(Exit4): - mov (%rcx), %eax - mov %eax, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit5): - xor %ah, %ah - movb %ah, 5(%rdx) -L(Exit5): - mov (%rcx), %eax - mov %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit6): - xor %ah, %ah - movb %ah, 6(%rdx) -L(Exit6): - mov (%rcx), %eax - mov %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit7): - xor %ah, %ah - movb %ah, 7(%rdx) -L(Exit7): - mov (%rcx), %eax - mov %eax, (%rdx) - mov 3(%rcx), %eax - mov %eax, 3(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8): - xor %ah, %ah - movb %ah, 8(%rdx) -L(Exit8): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit9): - xor %ah, %ah - movb %ah, 9(%rdx) -L(Exit9): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movb 8(%rcx), %al - movb %al, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit10): - xor %ah, %ah - movb %ah, 10(%rdx) -L(Exit10): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movw 8(%rcx), %ax - movw %ax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit11): - xor %ah, %ah - movb %ah, 11(%rdx) -L(Exit11): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit12): - xor %ah, %ah - movb %ah, 12(%rdx) -L(Exit12): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit13): - xor %ah, %ah - movb %ah, 13(%rdx) -L(Exit13): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 5(%rcx), %xmm1 - movlpd %xmm1, 5(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit14): - xor %ah, %ah - movb %ah, 14(%rdx) -L(Exit14): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 6(%rcx), %xmm1 - movlpd %xmm1, 6(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15): - xor %ah, %ah - movb %ah, 15(%rdx) -L(Exit15): - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit16): - xor %ah, %ah - movb %ah, 16(%rdx) -L(Exit16): - 
movlpd (%rcx), %xmm0 - movlpd 8(%rcx), %xmm1 - movlpd %xmm0, (%rdx) - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - test $0x01, %al - jnz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - test $0x02, %al - jnz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - test $0x04, %al - jnz L(Exit3) - cmp $3, %r8 - je L(StrncatExit3) - test $0x08, %al - jnz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - test $0x10, %al - jnz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - test $0x20, %al - jnz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - test $0x40, %al - jnz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase2): - test $0x01, %ah - jnz L(Exit9) - cmp $9, %r8 - je L(StrncatExit9) - test $0x02, %ah - jnz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - test $0x04, %ah - jnz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - test $0x8, %ah - jnz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - test $0x10, %ah - jnz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - test $0x20, %ah - jnz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - test $0x40, %ah - jnz L(Exit15) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - mov %rdi, %rax - ret - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $8, %r8 - ja L(ExitHighCase3) - cmp $1, %r8 - je L(StrncatExit1) - cmp $2, %r8 - je L(StrncatExit2) - cmp $3, %r8 - je L(StrncatExit3) - cmp $4, %r8 - je L(StrncatExit4) - cmp $5, %r8 - je L(StrncatExit5) - cmp $6, %r8 - je L(StrncatExit6) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - xor %ah, %ah - movb %ah, 8(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(ExitHighCase3): - cmp $9, %r8 - je L(StrncatExit9) - cmp $10, %r8 - je L(StrncatExit10) - cmp $11, %r8 - je L(StrncatExit11) - cmp $12, %r8 - je L(StrncatExit12) - cmp $13, %r8 - je L(StrncatExit13) - cmp $14, %r8 - je L(StrncatExit14) - cmp $15, %r8 - je L(StrncatExit15) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 8(%rcx), %xmm1 - movlpd %xmm1, 8(%rdx) - xor %ah, %ah - movb %ah, 16(%rdx) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit0): - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit15Bytes): - cmp $9, %r8 - je L(StrncatExit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $10, %r8 - je L(StrncatExit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $11, %r8 - je L(StrncatExit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $12, %r8 - je L(StrncatExit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $13, %r8 - je L(StrncatExit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmp $14, %r8 - je L(StrncatExit14) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - movlpd 7(%rcx), %xmm1 - movlpd %xmm1, 7(%rdx) - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - - .p2align 4 -L(StrncatExit8Bytes): - cmpb $0, (%rcx) - jz L(Exit1) - cmp $1, %r8 - je L(StrncatExit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $2, %r8 - je L(StrncatExit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $3, 
%r8 - je L(StrncatExit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $4, %r8 - je L(StrncatExit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $5, %r8 - je L(StrncatExit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $6, %r8 - je L(StrncatExit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmp $7, %r8 - je L(StrncatExit7) - movlpd (%rcx), %xmm0 - movlpd %xmm0, (%rdx) - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax - xor %cl, %cl - movb %cl, (%rax) - mov %rdi, %rax - ret - -# endif -END (STRCAT) -#endif diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S deleted file mode 100644 index 6c45ff3ec7..0000000000 --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCAT -#define STRCAT __strncat_ssse3 -#include "strcat-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
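The inlined length scan in the strcat-ssse3 file deleted above leans on one SSE2 idiom throughout: pcmpeqb a 16-byte chunk against an all-zero register, pmovmskb the result into a general register, and branch once on the mask; the L(aligned_64) loop then amortizes the branch by OR-ing the masks of four chunks per iteration. A self-contained intrinsics sketch of the basic idiom (an illustration with a made-up function name, not the removed code, which additionally unrolls and handles the unaligned head byte by byte):

#include <emmintrin.h>
#include <stddef.h>

static size_t
strlen_sse2_sketch (const char *s)
{
  /* Assumes s is 16-byte aligned so a full-vector load cannot cross
     into an unmapped page; the removed code reaches this state via
     byte checks plus an "and $-16" on the pointer.  */
  const __m128i zero = _mm_setzero_si128 ();
  for (size_t i = 0; ; i += 16)
    {
      __m128i chunk = _mm_load_si128 ((const __m128i *) (s + i));
      /* pcmpeqb: 0xff in every byte that equals 0; pmovmskb: one bit
         per byte.  A nonzero mask means this chunk holds the NUL.  */
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
      if (mask != 0)
        return i + __builtin_ctz (mask);  /* lowest set bit = NUL offset */
    }
}

From there the deleted code adds the computed length to the destination pointer (L(StartStrcpyPart)) and pulls in strcpy-ssse3.S for the copy half, which is why the .S file includes it in the middle.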
* Re: [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-03-25 19:57 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:57 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - > sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 - > sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 --------------------- > sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 - > 5 files changed, 879 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 323be3b969..a2ebc06c5f 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -59,7 +59,6 @@ sysdep_routines += \ > strcat-evex \ > strcat-sse2 \ > strcat-sse2-unaligned \ > - strcat-ssse3 \ > strchr-avx2 \ > strchr-avx2-rtm \ > strchr-evex \ > @@ -97,7 +96,6 @@ sysdep_routines += \ > strncat-c \ > strncat-evex \ > strncat-sse2-unaligned \ > - strncat-ssse3 \ > strncmp-avx2 \ > strncmp-avx2-rtm \ > strncmp-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index d6852ab365..4133ed7e43 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strcat_evex) > - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), > - __strcat_ssse3) > IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2)) > > @@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strncat_evex) > - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), > - __strncat_ssse3) > IFUNC_IMPL_ADD (array, i, strncat, 1, > __strncat_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2)) > diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > index 5bece38f78..a15afa44e9 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h > +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h > @@ -23,7 +23,6 @@ > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) > attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; > @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) > return OPTIMIZE (sse2_unaligned); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S >
deleted file mode 100644 > index 9f39e4fcd1..0000000000 > --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S > +++ /dev/null > @@ -1,866 +0,0 @@ > -/* strcat with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > - > -# ifndef STRCAT > -# define STRCAT __strcat_ssse3 > -# endif > - > -# define USE_AS_STRCAT > - > -.text > -ENTRY (STRCAT) > -# ifdef USE_AS_STRNCAT > - mov %rdx, %r8 > -# endif > - > - > -/* Inline corresponding strlen file, temporary until new strcpy > - implementation gets merged. */ > - > - xor %eax, %eax > - cmpb $0, (%rdi) > - jz L(exit_tail0) > - cmpb $0, 1(%rdi) > - jz L(exit_tail1) > - cmpb $0, 2(%rdi) > - jz L(exit_tail2) > - cmpb $0, 3(%rdi) > - jz L(exit_tail3) > - > - cmpb $0, 4(%rdi) > - jz L(exit_tail4) > - cmpb $0, 5(%rdi) > - jz L(exit_tail5) > - cmpb $0, 6(%rdi) > - jz L(exit_tail6) > - cmpb $0, 7(%rdi) > - jz L(exit_tail7) > - > - cmpb $0, 8(%rdi) > - jz L(exit_tail8) > - cmpb $0, 9(%rdi) > - jz L(exit_tail9) > - cmpb $0, 10(%rdi) > - jz L(exit_tail10) > - cmpb $0, 11(%rdi) > - jz L(exit_tail11) > - > - cmpb $0, 12(%rdi) > - jz L(exit_tail12) > - cmpb $0, 13(%rdi) > - jz L(exit_tail13) > - cmpb $0, 14(%rdi) > - jz L(exit_tail14) > - cmpb $0, 15(%rdi) > - jz L(exit_tail15) > - pxor %xmm0, %xmm0 > - lea 16(%rdi), %rcx > - lea 16(%rdi), %rax > - and $-16, %rax > - > - pcmpeqb (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - pxor %xmm1, %xmm1 > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - pxor %xmm2, %xmm2 > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - pxor %xmm3, %xmm3 > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz 
L(exit) > - > - pcmpeqb (%rax), %xmm0 > - pmovmskb %xmm0, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm1 > - pmovmskb %xmm1, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm2 > - pmovmskb %xmm2, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - pcmpeqb (%rax), %xmm3 > - pmovmskb %xmm3, %edx > - test %edx, %edx > - lea 16(%rax), %rax > - jnz L(exit) > - > - and $-0x40, %rax > - > - .p2align 4 > -L(aligned_64): > - pcmpeqb (%rax), %xmm0 > - pcmpeqb 16(%rax), %xmm1 > - pcmpeqb 32(%rax), %xmm2 > - pcmpeqb 48(%rax), %xmm3 > - pmovmskb %xmm0, %edx > - pmovmskb %xmm1, %r11d > - pmovmskb %xmm2, %r10d > - pmovmskb %xmm3, %r9d > - or %edx, %r9d > - or %r11d, %r9d > - or %r10d, %r9d > - lea 64(%rax), %rax > - jz L(aligned_64) > - > - test %edx, %edx > - jnz L(aligned_64_exit_16) > - test %r11d, %r11d > - jnz L(aligned_64_exit_32) > - test %r10d, %r10d > - jnz L(aligned_64_exit_48) > - > -L(aligned_64_exit_64): > - pmovmskb %xmm3, %edx > - jmp L(exit) > - > -L(aligned_64_exit_48): > - lea -16(%rax), %rax > - mov %r10d, %edx > - jmp L(exit) > - > -L(aligned_64_exit_32): > - lea -32(%rax), %rax > - mov %r11d, %edx > - jmp L(exit) > - > -L(aligned_64_exit_16): > - lea -48(%rax), %rax > - > -L(exit): > - sub %rcx, %rax > - test %dl, %dl > - jz L(exit_high) > - test $0x01, %dl > - jnz L(exit_tail0) > - > - test $0x02, %dl > - jnz L(exit_tail1) > - > - test $0x04, %dl > - jnz L(exit_tail2) > - > - test $0x08, %dl > - jnz L(exit_tail3) > - > - test $0x10, %dl > - jnz L(exit_tail4) > - > - test $0x20, %dl > - jnz L(exit_tail5) > - > - test $0x40, %dl > - jnz L(exit_tail6) > - add $7, %eax > -L(exit_tail0): > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_high): > - add $8, %eax > - test $0x01, %dh > - jnz L(exit_tail0) > - > - test $0x02, %dh > - jnz L(exit_tail1) > - > - test $0x04, %dh > - jnz L(exit_tail2) > - > - test $0x08, %dh > - jnz L(exit_tail3) > - > - test $0x10, %dh > - jnz L(exit_tail4) > - > - test $0x20, %dh > - jnz L(exit_tail5) > - > - test $0x40, %dh > - jnz L(exit_tail6) > - add $7, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail1): > - add $1, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail2): > - add $2, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail3): > - add $3, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail4): > - add $4, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail5): > - add $5, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail6): > - add $6, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail7): > - add $7, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail8): > - add $8, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail9): > - add $9, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail10): > - add $10, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail11): > - add $11, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail12): > - add $12, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail13): > - add $13, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail14): > - add $14, %eax > - jmp L(StartStrcpyPart) > - > - .p2align 4 > -L(exit_tail15): > - add $15, %eax > - > - .p2align 4 > -L(StartStrcpyPart): > - mov %rsi, %rcx > - lea (%rdi, %rax), %rdx > -# ifdef USE_AS_STRNCAT > - test %r8, %r8 > - jz L(StrncatExit0) > - cmp $8, %r8 > - jbe 
L(StrncatExit8Bytes) > -# endif > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmpb $0, 7(%rcx) > - jz L(Exit8) > - cmpb $0, 8(%rcx) > - jz L(Exit9) > -# ifdef USE_AS_STRNCAT > - cmp $16, %r8 > - jb L(StrncatExit15Bytes) > -# endif > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmpb $0, 14(%rcx) > - jz L(Exit15) > - cmpb $0, 15(%rcx) > - jz L(Exit16) > -# ifdef USE_AS_STRNCAT > - cmp $16, %r8 > - je L(StrncatExit16) > -# define USE_AS_STRNCPY > -# endif > - > -# include "strcpy-ssse3.S" > - > - .p2align 4 > -L(CopyFrom1To16Bytes): > - add %rsi, %rdx > - add %rsi, %rcx > - > - test %al, %al > - jz L(ExitHigh) > - test $0x01, %al > - jnz L(Exit1) > - test $0x02, %al > - jnz L(Exit2) > - test $0x04, %al > - jnz L(Exit3) > - test $0x08, %al > - jnz L(Exit4) > - test $0x10, %al > - jnz L(Exit5) > - test $0x20, %al > - jnz L(Exit6) > - test $0x40, %al > - jnz L(Exit7) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(ExitHigh): > - test $0x01, %ah > - jnz L(Exit9) > - test $0x02, %ah > - jnz L(Exit10) > - test $0x04, %ah > - jnz L(Exit11) > - test $0x08, %ah > - jnz L(Exit12) > - test $0x10, %ah > - jnz L(Exit13) > - test $0x20, %ah > - jnz L(Exit14) > - test $0x40, %ah > - jnz L(Exit15) > - movlpd (%rcx), %xmm0 > - movlpd 8(%rcx), %xmm1 > - movlpd %xmm0, (%rdx) > - movlpd %xmm1, 8(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit1): > - xor %ah, %ah > - movb %ah, 1(%rdx) > -L(Exit1): > - movb (%rcx), %al > - movb %al, (%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit2): > - xor %ah, %ah > - movb %ah, 2(%rdx) > -L(Exit2): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit3): > - xor %ah, %ah > - movb %ah, 3(%rdx) > -L(Exit3): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - movb 2(%rcx), %al > - movb %al, 2(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit4): > - xor %ah, %ah > - movb %ah, 4(%rdx) > -L(Exit4): > - mov (%rcx), %eax > - mov %eax, (%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit5): > - xor %ah, %ah > - movb %ah, 5(%rdx) > -L(Exit5): > - mov (%rcx), %eax > - mov %eax, (%rdx) > - movb 4(%rcx), %al > - movb %al, 4(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit6): > - xor %ah, %ah > - movb %ah, 6(%rdx) > -L(Exit6): > - mov (%rcx), %eax > - mov %eax, (%rdx) > - movw 4(%rcx), %ax > - movw %ax, 4(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit7): > - xor %ah, %ah > - movb %ah, 7(%rdx) > -L(Exit7): > - mov (%rcx), %eax > - mov %eax, (%rdx) > - mov 3(%rcx), %eax > - mov %eax, 3(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit8): > - xor %ah, %ah > - movb %ah, 8(%rdx) > -L(Exit8): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit9): > - xor %ah, %ah > - movb %ah, 9(%rdx) > -L(Exit9): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movb 8(%rcx), %al > - movb %al, 8(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit10): > - xor %ah, %ah > - movb %ah, 10(%rdx) > -L(Exit10): > - movlpd (%rcx), %xmm0 > - movlpd 
%xmm0, (%rdx) > - movw 8(%rcx), %ax > - movw %ax, 8(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit11): > - xor %ah, %ah > - movb %ah, 11(%rdx) > -L(Exit11): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - mov 7(%rcx), %eax > - mov %eax, 7(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit12): > - xor %ah, %ah > - movb %ah, 12(%rdx) > -L(Exit12): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - mov 8(%rcx), %eax > - mov %eax, 8(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit13): > - xor %ah, %ah > - movb %ah, 13(%rdx) > -L(Exit13): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 5(%rcx), %xmm1 > - movlpd %xmm1, 5(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit14): > - xor %ah, %ah > - movb %ah, 14(%rdx) > -L(Exit14): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 6(%rcx), %xmm1 > - movlpd %xmm1, 6(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit15): > - xor %ah, %ah > - movb %ah, 15(%rdx) > -L(Exit15): > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 7(%rcx), %xmm1 > - movlpd %xmm1, 7(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit16): > - xor %ah, %ah > - movb %ah, 16(%rdx) > -L(Exit16): > - movlpd (%rcx), %xmm0 > - movlpd 8(%rcx), %xmm1 > - movlpd %xmm0, (%rdx) > - movlpd %xmm1, 8(%rdx) > - mov %rdi, %rax > - ret > - > -# ifdef USE_AS_STRNCPY > - > - .p2align 4 > -L(CopyFrom1To16BytesCase2): > - add $16, %r8 > - add %rsi, %rcx > - lea (%rsi, %rdx), %rsi > - lea -9(%r8), %rdx > - and $1<<7, %dh > - or %al, %dh > - test %dh, %dh > - lea (%rsi), %rdx > - jz L(ExitHighCase2) > - > - test $0x01, %al > - jnz L(Exit1) > - cmp $1, %r8 > - je L(StrncatExit1) > - test $0x02, %al > - jnz L(Exit2) > - cmp $2, %r8 > - je L(StrncatExit2) > - test $0x04, %al > - jnz L(Exit3) > - cmp $3, %r8 > - je L(StrncatExit3) > - test $0x08, %al > - jnz L(Exit4) > - cmp $4, %r8 > - je L(StrncatExit4) > - test $0x10, %al > - jnz L(Exit5) > - cmp $5, %r8 > - je L(StrncatExit5) > - test $0x20, %al > - jnz L(Exit6) > - cmp $6, %r8 > - je L(StrncatExit6) > - test $0x40, %al > - jnz L(Exit7) > - cmp $7, %r8 > - je L(StrncatExit7) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > - xor %cl, %cl > - movb %cl, (%rax) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(ExitHighCase2): > - test $0x01, %ah > - jnz L(Exit9) > - cmp $9, %r8 > - je L(StrncatExit9) > - test $0x02, %ah > - jnz L(Exit10) > - cmp $10, %r8 > - je L(StrncatExit10) > - test $0x04, %ah > - jnz L(Exit11) > - cmp $11, %r8 > - je L(StrncatExit11) > - test $0x8, %ah > - jnz L(Exit12) > - cmp $12, %r8 > - je L(StrncatExit12) > - test $0x10, %ah > - jnz L(Exit13) > - cmp $13, %r8 > - je L(StrncatExit13) > - test $0x20, %ah > - jnz L(Exit14) > - cmp $14, %r8 > - je L(StrncatExit14) > - test $0x40, %ah > - jnz L(Exit15) > - cmp $15, %r8 > - je L(StrncatExit15) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 8(%rcx), %xmm1 > - movlpd %xmm1, 8(%rdx) > - mov %rdi, %rax > - ret > - > -L(CopyFrom1To16BytesCase2OrCase3): > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - > - .p2align 4 > -L(CopyFrom1To16BytesCase3): > - add $16, %r8 > - add %rsi, %rdx > - add %rsi, %rcx > - > - cmp $8, %r8 > - ja L(ExitHighCase3) > - cmp $1, %r8 > - je L(StrncatExit1) > - cmp $2, %r8 > - je L(StrncatExit2) > - cmp $3, %r8 > - je L(StrncatExit3) > - cmp $4, %r8 > - je L(StrncatExit4) > - cmp $5, %r8 > - je L(StrncatExit5) > - cmp $6, 
%r8 > - je L(StrncatExit6) > - cmp $7, %r8 > - je L(StrncatExit7) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - xor %ah, %ah > - movb %ah, 8(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(ExitHighCase3): > - cmp $9, %r8 > - je L(StrncatExit9) > - cmp $10, %r8 > - je L(StrncatExit10) > - cmp $11, %r8 > - je L(StrncatExit11) > - cmp $12, %r8 > - je L(StrncatExit12) > - cmp $13, %r8 > - je L(StrncatExit13) > - cmp $14, %r8 > - je L(StrncatExit14) > - cmp $15, %r8 > - je L(StrncatExit15) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 8(%rcx), %xmm1 > - movlpd %xmm1, 8(%rdx) > - xor %ah, %ah > - movb %ah, 16(%rdx) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit0): > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit15Bytes): > - cmp $9, %r8 > - je L(StrncatExit9) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmp $10, %r8 > - je L(StrncatExit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmp $11, %r8 > - je L(StrncatExit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmp $12, %r8 > - je L(StrncatExit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmp $13, %r8 > - je L(StrncatExit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmp $14, %r8 > - je L(StrncatExit14) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - movlpd 7(%rcx), %xmm1 > - movlpd %xmm1, 7(%rdx) > - lea 14(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > - xor %cl, %cl > - movb %cl, (%rax) > - mov %rdi, %rax > - ret > - > - .p2align 4 > -L(StrncatExit8Bytes): > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmp $1, %r8 > - je L(StrncatExit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmp $2, %r8 > - je L(StrncatExit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmp $3, %r8 > - je L(StrncatExit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmp $4, %r8 > - je L(StrncatExit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmp $5, %r8 > - je L(StrncatExit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmp $6, %r8 > - je L(StrncatExit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmp $7, %r8 > - je L(StrncatExit7) > - movlpd (%rcx), %xmm0 > - movlpd %xmm0, (%rdx) > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > - xor %cl, %cl > - movb %cl, (%rax) > - mov %rdi, %rax > - ret > - > -# endif > -END (STRCAT) > -#endif > diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S > deleted file mode 100644 > index 6c45ff3ec7..0000000000 > --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STRNCAT > -#define STRCAT __strncat_ssse3 > -#include "strcat-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
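Before the final patch below: the strcpy-ssse3.S it removes is built almost entirely from PALIGNR realignment loops (the L(Shl1)..L(Shl15) blocks in the diff). The following is a minimal sketch of that technique, assuming SSSE3 hardware and a compiler flag such as -mssse3; it uses the _mm_alignr_epi8 intrinsic and an invented function name, and is an illustration of the idea, not the removed implementation. Because PALIGNR takes its shift count as an immediate, the real code needs one copy of the loop per possible source misalignment, which is a large part of why the file is so big.

#include <stdio.h>
#include <stdint.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (PALIGNR) */

/* Produce 16 bytes starting at the misaligned address src_aligned + 1
   from two aligned loads, the trick behind the L(Shl1) block.  */
static void
copy16_misaligned_by_1 (const uint8_t *src_aligned, uint8_t *dst)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) src_aligned);        /* bytes 0..15  */
  __m128i hi = _mm_load_si128 ((const __m128i *) (src_aligned + 16)); /* bytes 16..31 */
  /* (hi:lo) shifted right by one byte = bytes 1..16 of the source.  */
  __m128i v = _mm_alignr_epi8 (hi, lo, 1);
  _mm_storeu_si128 ((__m128i *) dst, v);
}

int
main (void)
{
  _Alignas (16) uint8_t src[32];
  uint8_t dst[16];
  for (int i = 0; i < 32; i++)
    src[i] = (uint8_t) i;
  copy16_misaligned_by_1 (src, dst);
  printf ("%u %u\n", dst[0], dst[15]);   /* prints: 1 16 */
  return 0;
}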
* [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (3 preceding siblings ...) 2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein @ 2022-03-25 18:36 ` Noah Goldstein 2022-03-25 19:57 ` H.J. Lu 2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu 2022-03-25 20:34 ` Andreas Schwab 6 siblings, 1 reply; 49+ messages in thread From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw) To: libc-alpha With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer SSSE3. As a result it's no longer worth the code size cost. --- sysdeps/x86_64/multiarch/Makefile | 4 - sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - 6 files changed, 3572 deletions(-) delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile index a2ebc06c5f..292353bad7 100644 --- a/sysdeps/x86_64/multiarch/Makefile +++ b/sysdeps/x86_64/multiarch/Makefile @@ -42,13 +42,11 @@ sysdep_routines += \ stpcpy-evex \ stpcpy-sse2 \ stpcpy-sse2-unaligned \ - stpcpy-ssse3 \ stpncpy-avx2 \ stpncpy-avx2-rtm \ stpncpy-c \ stpncpy-evex \ stpncpy-sse2-unaligned \ - stpncpy-ssse3 \ strcasecmp_l-avx2 \ strcasecmp_l-avx2-rtm \ strcasecmp_l-evex \ @@ -79,7 +77,6 @@ sysdep_routines += \ strcpy-evex \ strcpy-sse2 \ strcpy-sse2-unaligned \ - strcpy-ssse3 \ strcspn-c \ strcspn-sse2 \ strlen-avx2 \ @@ -106,7 +103,6 @@ sysdep_routines += \ strncpy-c \ strncpy-evex \ strncpy-sse2-unaligned \ - strncpy-ssse3 \ strnlen-avx2 \ strnlen-avx2-rtm \ strnlen-evex \ diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c index 4133ed7e43..505b8002e0 100644 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ IFUNC_IMPL (i, name, stpncpy, - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), - __stpncpy_ssse3) IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), __stpncpy_avx2) IFUNC_IMPL_ADD (array, i, stpncpy, @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ IFUNC_IMPL (i, name, stpcpy, - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), - __stpcpy_ssse3) IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), __stpcpy_avx2) IFUNC_IMPL_ADD (array, i, stpcpy, @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strcpy_evex) - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), - __strcpy_ssse3) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, (CPU_FEATURE_USABLE (AVX512VL) && CPU_FEATURE_USABLE (AVX512BW)), __strncpy_evex) - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), - __strncpy_ssse3) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2_unaligned) IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S deleted file mode 100644 index d971c2da38..0000000000 --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STPCPY -#define STRCPY __stpcpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S deleted file mode 100644 index 14ed16f6b5..0000000000 --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S +++ /dev/null @@ -1,4 +0,0 @@ -#define USE_AS_STPCPY -#define USE_AS_STRNCPY -#define STRCPY __stpncpy_ssse3 -#include "strcpy-ssse3.S" diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S deleted file mode 100644 index f617a535cf..0000000000 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S +++ /dev/null @@ -1,3550 +0,0 @@ -/* strcpy with SSSE3 - Copyright (C) 2011-2022 Free Software Foundation, Inc. - This file is part of the GNU C Library. - - The GNU C Library is free software; you can redistribute it and/or - modify it under the terms of the GNU Lesser General Public - License as published by the Free Software Foundation; either - version 2.1 of the License, or (at your option) any later version. - - The GNU C Library is distributed in the hope that it will be useful, - but WITHOUT ANY WARRANTY; without even the implied warranty of - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - Lesser General Public License for more details. - - You should have received a copy of the GNU Lesser General Public - License along with the GNU C Library; if not, see - <https://www.gnu.org/licenses/>. 
*/ - -#if IS_IN (libc) - -# ifndef USE_AS_STRCAT -# include <sysdep.h> - -# ifndef STRCPY -# define STRCPY __strcpy_ssse3 -# endif - - .section .text.ssse3,"ax",@progbits -ENTRY (STRCPY) - - mov %rsi, %rcx -# ifdef USE_AS_STRNCPY - mov %RDX_LP, %R8_LP -# endif - mov %rdi, %rdx -# ifdef USE_AS_STRNCPY - test %R8_LP, %R8_LP - jz L(Exit0) - cmp $8, %R8_LP - jbe L(StrncpyExit8Bytes) -# endif - cmpb $0, (%rcx) - jz L(Exit1) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmpb $0, 6(%rcx) - jz L(Exit7) - cmpb $0, 7(%rcx) - jz L(Exit8) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - jb L(StrncpyExit15Bytes) -# endif - cmpb $0, 8(%rcx) - jz L(Exit9) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmpb $0, 13(%rcx) - jz L(Exit14) - cmpb $0, 14(%rcx) - jz L(Exit15) -# ifdef USE_AS_STRNCPY - cmp $16, %r8 - je L(Exit16) -# endif - cmpb $0, 15(%rcx) - jz L(Exit16) -# endif - -# ifdef USE_AS_STRNCPY - mov %rcx, %rsi - sub $16, %r8 - and $0xf, %rsi - -/* add 16 bytes rcx_offset to r8 */ - - add %rsi, %r8 -# endif - lea 16(%rcx), %rsi - and $-16, %rsi - pxor %xmm0, %xmm0 - mov (%rcx), %r9 - mov %r9, (%rdx) - pcmpeqb (%rsi), %xmm0 - mov 8(%rcx), %r9 - mov %r9, 8(%rdx) - -/* convert byte mask in xmm0 to bit mask */ - - pmovmskb %xmm0, %rax - sub %rcx, %rsi - -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - mov %rdx, %rax - lea 16(%rdx), %rdx - and $-16, %rdx - sub %rdx, %rax - -# ifdef USE_AS_STRNCPY - add %rax, %rsi - lea -1(%rsi), %rsi - and $1<<31, %esi - test %rsi, %rsi - jnz L(ContinueCopy) - lea 16(%r8), %r8 - -L(ContinueCopy): -# endif - sub %rax, %rcx - mov %rcx, %rax - and $0xf, %rax - mov $0, %rsi - -/* case: rcx_offset == rdx_offset */ - - jz L(Align16Both) - - cmp $8, %rax - jae L(ShlHigh8) - cmp $1, %rax - je L(Shl1) - cmp $2, %rax - je L(Shl2) - cmp $3, %rax - je L(Shl3) - cmp $4, %rax - je L(Shl4) - cmp $5, %rax - je L(Shl5) - cmp $6, %rax - je L(Shl6) - jmp L(Shl7) - -L(ShlHigh8): - je L(Shl8) - cmp $9, %rax - je L(Shl9) - cmp $10, %rax - je L(Shl10) - cmp $11, %rax - je L(Shl11) - cmp $12, %rax - je L(Shl12) - cmp $13, %rax - je L(Shl13) - cmp $14, %rax - je L(Shl14) - jmp L(Shl15) - -L(Align16Both): - movaps (%rcx), %xmm1 - movaps 16(%rcx), %xmm2 - movaps %xmm1, (%rdx) - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm4 - movaps %xmm3, (%rdx, %rsi) - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm1 - movaps %xmm4, (%rdx, %rsi) - pcmpeqb %xmm1, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm2 - movaps %xmm1, (%rdx, %rsi) 
- pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps 16(%rcx, %rsi), %xmm3 - movaps %xmm2, (%rdx, %rsi) - pcmpeqb %xmm3, %xmm0 - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) -# endif - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - movaps %xmm3, (%rdx, %rsi) - mov %rcx, %rax - lea 16(%rcx, %rsi), %rcx - and $-0x40, %rcx - sub %rcx, %rax - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - lea 112(%r8, %rax), %r8 -# endif - mov $-0x40, %rsi - - .p2align 4 -L(Aligned64Loop): - movaps (%rcx), %xmm2 - movaps %xmm2, %xmm4 - movaps 16(%rcx), %xmm5 - movaps 32(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 48(%rcx), %xmm7 - pminub %xmm5, %xmm2 - pminub %xmm7, %xmm3 - pminub %xmm2, %xmm3 - pcmpeqb %xmm0, %xmm3 - pmovmskb %xmm3, %rax - lea 64(%rdx), %rdx - lea 64(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeaveCase2OrCase3) -# endif - test %rax, %rax - jnz L(Aligned64Leave) - movaps %xmm4, -64(%rdx) - movaps %xmm5, -48(%rdx) - movaps %xmm6, -32(%rdx) - movaps %xmm7, -16(%rdx) - jmp L(Aligned64Loop) - -L(Aligned64Leave): -# ifdef USE_AS_STRNCPY - lea 48(%r8), %r8 -# endif - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - test %rax, %rax - lea 16(%rsi), %rsi - jnz L(CopyFrom1To16Bytes) - - movaps %xmm6, -32(%rdx) - pcmpeqb %xmm7, %xmm0 -# ifdef USE_AS_STRNCPY - lea -16(%r8), %r8 -# endif - pmovmskb %xmm0, %rax - lea 16(%rsi), %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl1): - movaps -1(%rcx), %xmm1 - movaps 15(%rcx), %xmm2 -L(Shl1Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit1Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl1LoopExit) - - palignr $1, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 31(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -15(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -1(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl1LoopStart): - movaps 15(%rcx), %xmm2 - movaps 31(%rcx), %xmm3 - movaps %xmm3, 
%xmm6 - movaps 47(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 63(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $1, %xmm4, %xmm5 - test %rax, %rax - palignr $1, %xmm3, %xmm4 - jnz L(Shl1Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave1) -# endif - palignr $1, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $1, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl1LoopStart) - -L(Shl1LoopExit): - movdqu -1(%rcx), %xmm1 - mov $15, %rsi - movdqu %xmm1, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl2): - movaps -2(%rcx), %xmm1 - movaps 14(%rcx), %xmm2 -L(Shl2Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit2Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl2LoopExit) - - palignr $2, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 30(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -14(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -2(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl2LoopStart): - movaps 14(%rcx), %xmm2 - movaps 30(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 46(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 62(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $2, %xmm4, %xmm5 - test %rax, %rax - palignr $2, %xmm3, %xmm4 - jnz L(Shl2Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave2) -# endif - palignr $2, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $2, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl2LoopStart) - -L(Shl2LoopExit): - movdqu -2(%rcx), %xmm1 - mov $14, %rsi - movdqu %xmm1, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl3): - movaps -3(%rcx), %xmm1 - movaps 13(%rcx), %xmm2 -L(Shl3Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe 
L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit3Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl3LoopExit) - - palignr $3, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 29(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -13(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -3(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl3LoopStart): - movaps 13(%rcx), %xmm2 - movaps 29(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 45(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 61(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $3, %xmm4, %xmm5 - test %rax, %rax - palignr $3, %xmm3, %xmm4 - jnz L(Shl3Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave3) -# endif - palignr $3, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $3, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl3LoopStart) - -L(Shl3LoopExit): - movdqu -3(%rcx), %xmm1 - mov $13, %rsi - movdqu %xmm1, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl4): - movaps -4(%rcx), %xmm1 - movaps 12(%rcx), %xmm2 -L(Shl4Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit4Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl4LoopExit) - - palignr $4, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 28(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -12(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -4(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl4LoopStart): - movaps 12(%rcx), %xmm2 - movaps 28(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 44(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 60(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - 
movaps %xmm5, %xmm7 - palignr $4, %xmm4, %xmm5 - test %rax, %rax - palignr $4, %xmm3, %xmm4 - jnz L(Shl4Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave4) -# endif - palignr $4, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $4, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl4LoopStart) - -L(Shl4LoopExit): - movdqu -4(%rcx), %xmm1 - mov $12, %rsi - movdqu %xmm1, -4(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl5): - movaps -5(%rcx), %xmm1 - movaps 11(%rcx), %xmm2 -L(Shl5Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit5Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl5LoopExit) - - palignr $5, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 27(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -11(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -5(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl5LoopStart): - movaps 11(%rcx), %xmm2 - movaps 27(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 43(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 59(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $5, %xmm4, %xmm5 - test %rax, %rax - palignr $5, %xmm3, %xmm4 - jnz L(Shl5Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave5) -# endif - palignr $5, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $5, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl5LoopStart) - -L(Shl5LoopExit): - movdqu -5(%rcx), %xmm1 - mov $11, %rsi - movdqu %xmm1, -5(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl6): - movaps -6(%rcx), %xmm1 - movaps 10(%rcx), %xmm2 -L(Shl6Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - 
pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit6Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl6LoopExit) - - palignr $6, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 26(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -10(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -6(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl6LoopStart): - movaps 10(%rcx), %xmm2 - movaps 26(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 42(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 58(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $6, %xmm4, %xmm5 - test %rax, %rax - palignr $6, %xmm3, %xmm4 - jnz L(Shl6Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave6) -# endif - palignr $6, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $6, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl6LoopStart) - -L(Shl6LoopExit): - mov (%rcx), %r9 - mov 6(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 6(%rdx) - mov $10, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl7): - movaps -7(%rcx), %xmm1 - movaps 9(%rcx), %xmm2 -L(Shl7Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit7Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl7LoopExit) - - palignr $7, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 25(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -9(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -7(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl7LoopStart): - movaps 9(%rcx), %xmm2 - movaps 25(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 41(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 57(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $7, %xmm4, %xmm5 - test %rax, %rax - palignr $7, %xmm3, %xmm4 - jnz L(Shl7Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave7) -# 
endif - palignr $7, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $7, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl7LoopStart) - -L(Shl7LoopExit): - mov (%rcx), %r9 - mov 5(%rcx), %esi - mov %r9, (%rdx) - mov %esi, 5(%rdx) - mov $9, %rsi - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl8): - movaps -8(%rcx), %xmm1 - movaps 8(%rcx), %xmm2 -L(Shl8Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit8Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl8LoopExit) - - palignr $8, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 24(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -8(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -8(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl8LoopStart): - movaps 8(%rcx), %xmm2 - movaps 24(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 40(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 56(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $8, %xmm4, %xmm5 - test %rax, %rax - palignr $8, %xmm3, %xmm4 - jnz L(Shl8Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave8) -# endif - palignr $8, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $8, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl8LoopStart) - -L(Shl8LoopExit): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl9): - movaps -9(%rcx), %xmm1 - movaps 7(%rcx), %xmm2 -L(Shl9Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz 
L(Shl9LoopExit) - - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit9Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl9LoopExit) - - palignr $9, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 23(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -7(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -9(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl9LoopStart): - movaps 7(%rcx), %xmm2 - movaps 23(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 39(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 55(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $9, %xmm4, %xmm5 - test %rax, %rax - palignr $9, %xmm3, %xmm4 - jnz L(Shl9Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave9) -# endif - palignr $9, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $9, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl9LoopStart) - -L(Shl9LoopExit): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl10): - movaps -10(%rcx), %xmm1 - movaps 6(%rcx), %xmm2 -L(Shl10Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit10Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl10LoopExit) - - palignr $10, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 22(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -6(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -10(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl10LoopStart): - movaps 6(%rcx), %xmm2 - movaps 22(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 38(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 54(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $10, %xmm4, %xmm5 - test %rax, %rax - palignr $10, %xmm3, %xmm4 - jnz L(Shl10Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave10) -# endif - palignr $10, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $10, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps 
%xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl10LoopStart) - -L(Shl10LoopExit): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl11): - movaps -11(%rcx), %xmm1 - movaps 5(%rcx), %xmm2 -L(Shl11Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit11Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl11LoopExit) - - palignr $11, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 21(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -5(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -11(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl11LoopStart): - movaps 5(%rcx), %xmm2 - movaps 21(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 37(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 53(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $11, %xmm4, %xmm5 - test %rax, %rax - palignr $11, %xmm3, %xmm4 - jnz L(Shl11Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave11) -# endif - palignr $11, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $11, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl11LoopStart) - -L(Shl11LoopExit): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl12): - movaps -12(%rcx), %xmm1 - movaps 4(%rcx), %xmm2 -L(Shl12Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx 
-# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit12Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl12LoopExit) - - palignr $12, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 20(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -4(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -12(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl12LoopStart): - movaps 4(%rcx), %xmm2 - movaps 20(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 36(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 52(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $12, %xmm4, %xmm5 - test %rax, %rax - palignr $12, %xmm3, %xmm4 - jnz L(Shl12Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave12) -# endif - palignr $12, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $12, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl12LoopStart) - -L(Shl12LoopExit): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl13): - movaps -13(%rcx), %xmm1 - movaps 3(%rcx), %xmm2 -L(Shl13Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit13Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl13LoopExit) - - palignr $13, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 19(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -3(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -13(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl13LoopStart): - movaps 3(%rcx), %xmm2 - movaps 19(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 35(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 51(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $13, %xmm4, %xmm5 - test %rax, %rax - palignr $13, %xmm3, %xmm4 - jnz L(Shl13Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave13) -# endif - palignr $13, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $13, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl13LoopStart) - -L(Shl13LoopExit): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - jmp L(CopyFrom1To16Bytes) - - 
.p2align 4 -L(Shl14): - movaps -14(%rcx), %xmm1 - movaps 2(%rcx), %xmm2 -L(Shl14Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit14Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl14LoopExit) - - palignr $14, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - lea 18(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -2(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -14(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl14LoopStart): - movaps 2(%rcx), %xmm2 - movaps 18(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 34(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 50(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $14, %xmm4, %xmm5 - test %rax, %rax - palignr $14, %xmm3, %xmm4 - jnz L(Shl14Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave14) -# endif - palignr $14, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $14, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl14LoopStart) - -L(Shl14LoopExit): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - jmp L(CopyFrom1To16Bytes) - - .p2align 4 -L(Shl15): - movaps -15(%rcx), %xmm1 - movaps 1(%rcx), %xmm2 -L(Shl15Start): - pcmpeqb %xmm2, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm1 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx - movaps %xmm2, %xmm3 -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - - pcmpeqb %xmm2, %xmm0 - lea 16(%rdx), %rdx - pmovmskb %xmm0, %rax - lea 16(%rcx), %rcx -# ifdef USE_AS_STRNCPY - sub $16, %r8 - jbe L(StrncpyExit15Case2OrCase3) -# endif - test %rax, %rax - jnz L(Shl15LoopExit) - - palignr $15, %xmm3, %xmm2 - movaps 
%xmm2, (%rdx) - lea 17(%rcx), %rcx - lea 16(%rdx), %rdx - - mov %rcx, %rax - and $-0x40, %rcx - sub %rcx, %rax - lea -1(%rcx), %rcx - sub %rax, %rdx -# ifdef USE_AS_STRNCPY - add %rax, %r8 -# endif - movaps -15(%rcx), %xmm1 - -/* 64 bytes loop */ - .p2align 4 -L(Shl15LoopStart): - movaps 1(%rcx), %xmm2 - movaps 17(%rcx), %xmm3 - movaps %xmm3, %xmm6 - movaps 33(%rcx), %xmm4 - movaps %xmm4, %xmm7 - movaps 49(%rcx), %xmm5 - pminub %xmm2, %xmm6 - pminub %xmm5, %xmm7 - pminub %xmm6, %xmm7 - pcmpeqb %xmm0, %xmm7 - pmovmskb %xmm7, %rax - movaps %xmm5, %xmm7 - palignr $15, %xmm4, %xmm5 - test %rax, %rax - palignr $15, %xmm3, %xmm4 - jnz L(Shl15Start) -# ifdef USE_AS_STRNCPY - sub $64, %r8 - jbe L(StrncpyLeave15) -# endif - palignr $15, %xmm2, %xmm3 - lea 64(%rcx), %rcx - palignr $15, %xmm1, %xmm2 - movaps %xmm7, %xmm1 - movaps %xmm5, 48(%rdx) - movaps %xmm4, 32(%rdx) - movaps %xmm3, 16(%rdx) - movaps %xmm2, (%rdx) - lea 64(%rdx), %rdx - jmp L(Shl15LoopStart) - -L(Shl15LoopExit): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) -# ifdef USE_AS_STRCAT - jmp L(CopyFrom1To16Bytes) -# endif - -# ifndef USE_AS_STRCAT - - .p2align 4 -L(CopyFrom1To16Bytes): -# ifdef USE_AS_STRNCPY - add $16, %r8 -# endif - add %rsi, %rdx - add %rsi, %rcx - - test %al, %al - jz L(ExitHigh) - test $0x01, %al - jnz L(Exit1) - test $0x02, %al - jnz L(Exit2) - test $0x04, %al - jnz L(Exit3) - test $0x08, %al - jnz L(Exit4) - test $0x10, %al - jnz L(Exit5) - test $0x20, %al - jnz L(Exit6) - test $0x40, %al - jnz L(Exit7) - - .p2align 4 -L(Exit8): - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $8, %r8 - lea 8(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(ExitHigh): - test $0x01, %ah - jnz L(Exit9) - test $0x02, %ah - jnz L(Exit10) - test $0x04, %ah - jnz L(Exit11) - test $0x08, %ah - jnz L(Exit12) - test $0x10, %ah - jnz L(Exit13) - test $0x20, %ah - jnz L(Exit14) - test $0x40, %ah - jnz L(Exit15) - - .p2align 4 -L(Exit16): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %rax - mov %rax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 15(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $16, %r8 - lea 16(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - - .p2align 4 -L(CopyFrom1To16BytesCase2): - add $16, %r8 - add %rsi, %rcx - lea (%rsi, %rdx), %rsi - lea -9(%r8), %rdx - and $1<<7, %dh - or %al, %dh - test %dh, %dh - lea (%rsi), %rdx - jz L(ExitHighCase2) - - cmp $1, %r8 - je L(Exit1) - test $0x01, %al - jnz L(Exit1) - cmp $2, %r8 - je L(Exit2) - test $0x02, %al - jnz L(Exit2) - cmp $3, %r8 - je L(Exit3) - test $0x04, %al - jnz L(Exit3) - cmp $4, %r8 - je L(Exit4) - test $0x08, %al - jnz L(Exit4) - cmp $5, %r8 - je L(Exit5) - test $0x10, %al - jnz L(Exit5) - cmp $6, %r8 - je L(Exit6) - test $0x20, %al - jnz L(Exit6) - cmp $7, %r8 - je L(Exit7) - test $0x40, %al - jnz L(Exit7) - jmp L(Exit8) - - .p2align 4 -L(ExitHighCase2): - cmp $9, %r8 - je L(Exit9) - test $0x01, %ah - jnz L(Exit9) - cmp $10, %r8 - je L(Exit10) - test $0x02, %ah - jnz L(Exit10) - cmp $11, %r8 - je L(Exit11) - test $0x04, %ah - jnz L(Exit11) - cmp $12, %r8 - je L(Exit12) - test $0x8, %ah - jnz L(Exit12) - cmp $13, %r8 - je L(Exit13) - test $0x10, %ah - jnz L(Exit13) - cmp $14, %r8 - je L(Exit14) - test $0x20, %ah - jnz L(Exit14) - cmp 
$15, %r8 - je L(Exit15) - test $0x40, %ah - jnz L(Exit15) - jmp L(Exit16) - -L(CopyFrom1To16BytesCase2OrCase3): - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - - .p2align 4 -L(CopyFrom1To16BytesCase3): - add $16, %r8 - add %rsi, %rdx - add %rsi, %rcx - - cmp $16, %r8 - je L(Exit16) - cmp $8, %r8 - je L(Exit8) - jg L(More8Case3) - cmp $4, %r8 - je L(Exit4) - jg L(More4Case3) - cmp $2, %r8 - jl L(Exit1) - je L(Exit2) - jg L(Exit3) -L(More8Case3): /* but less than 16 */ - cmp $12, %r8 - je L(Exit12) - jl L(Less12Case3) - cmp $14, %r8 - jl L(Exit13) - je L(Exit14) - jg L(Exit15) -L(More4Case3): /* but less than 8 */ - cmp $6, %r8 - jl L(Exit5) - je L(Exit6) - jg L(Exit7) -L(Less12Case3): /* but more than 8 */ - cmp $10, %r8 - jl L(Exit9) - je L(Exit10) - jg L(Exit11) -# endif - - .p2align 4 -L(Exit1): - movb (%rcx), %al - movb %al, (%rdx) -# ifdef USE_AS_STPCPY - lea (%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $1, %r8 - lea 1(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit2): - movw (%rcx), %ax - movw %ax, (%rdx) -# ifdef USE_AS_STPCPY - lea 1(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $2, %r8 - lea 2(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit3): - movw (%rcx), %ax - movw %ax, (%rdx) - movb 2(%rcx), %al - movb %al, 2(%rdx) -# ifdef USE_AS_STPCPY - lea 2(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $3, %r8 - lea 3(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit4): - movl (%rcx), %eax - movl %eax, (%rdx) -# ifdef USE_AS_STPCPY - lea 3(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $4, %r8 - lea 4(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit5): - movl (%rcx), %eax - movl %eax, (%rdx) - movb 4(%rcx), %al - movb %al, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 4(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $5, %r8 - lea 5(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit6): - movl (%rcx), %eax - movl %eax, (%rdx) - movw 4(%rcx), %ax - movw %ax, 4(%rdx) -# ifdef USE_AS_STPCPY - lea 5(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $6, %r8 - lea 6(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit7): - movl (%rcx), %eax - movl %eax, (%rdx) - movl 3(%rcx), %eax - movl %eax, 3(%rdx) -# ifdef USE_AS_STPCPY - lea 6(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $7, %r8 - lea 7(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit9): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %eax - mov %eax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 8(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $9, %r8 - lea 9(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit10): - mov (%rcx), %rax 
- mov %rax, (%rdx) - mov 6(%rcx), %eax - mov %eax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 9(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $10, %r8 - lea 10(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit11): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %eax - mov %eax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 10(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $11, %r8 - lea 11(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit12): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 8(%rcx), %eax - mov %eax, 8(%rdx) -# ifdef USE_AS_STPCPY - lea 11(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $12, %r8 - lea 12(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit13): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 5(%rcx), %rax - mov %rax, 5(%rdx) -# ifdef USE_AS_STPCPY - lea 12(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $13, %r8 - lea 13(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit14): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 6(%rcx), %rax - mov %rax, 6(%rdx) -# ifdef USE_AS_STPCPY - lea 13(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $14, %r8 - lea 14(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - - .p2align 4 -L(Exit15): - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax -# else - mov %rdi, %rax -# endif -# ifdef USE_AS_STRNCPY - sub $15, %r8 - lea 15(%rdx), %rcx - jnz L(StrncpyFillTailWithZero1) -# ifdef USE_AS_STPCPY - cmpb $1, (%rax) - sbb $-1, %rax -# endif -# endif - ret - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(Fill0): - ret - - .p2align 4 -L(Fill1): - movb %dl, (%rcx) - ret - - .p2align 4 -L(Fill2): - movw %dx, (%rcx) - ret - - .p2align 4 -L(Fill3): - movw %dx, (%rcx) - movb %dl, 2(%rcx) - ret - - .p2align 4 -L(Fill4): - movl %edx, (%rcx) - ret - - .p2align 4 -L(Fill5): - movl %edx, (%rcx) - movb %dl, 4(%rcx) - ret - - .p2align 4 -L(Fill6): - movl %edx, (%rcx) - movw %dx, 4(%rcx) - ret - - .p2align 4 -L(Fill7): - movl %edx, (%rcx) - movl %edx, 3(%rcx) - ret - - .p2align 4 -L(Fill8): - mov %rdx, (%rcx) - ret - - .p2align 4 -L(Fill9): - mov %rdx, (%rcx) - movb %dl, 8(%rcx) - ret - - .p2align 4 -L(Fill10): - mov %rdx, (%rcx) - movw %dx, 8(%rcx) - ret - - .p2align 4 -L(Fill11): - mov %rdx, (%rcx) - movl %edx, 7(%rcx) - ret - - .p2align 4 -L(Fill12): - mov %rdx, (%rcx) - movl %edx, 8(%rcx) - ret - - .p2align 4 -L(Fill13): - mov %rdx, (%rcx) - mov %rdx, 5(%rcx) - ret - - .p2align 4 -L(Fill14): - mov %rdx, (%rcx) - mov %rdx, 6(%rcx) - ret - - .p2align 4 -L(Fill15): - mov %rdx, (%rcx) - mov %rdx, 7(%rcx) - ret - - .p2align 4 -L(Fill16): - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - ret - - .p2align 4 -L(StrncpyFillExit1): - lea 16(%r8), %r8 -L(FillFrom1To16Bytes): - test %r8, %r8 - jz L(Fill0) - cmp $16, %r8 - je L(Fill16) - cmp $8, %r8 - je L(Fill8) - jg L(FillMore8) - cmp $4, %r8 - je L(Fill4) - jg L(FillMore4) - cmp $2, %r8 - jl L(Fill1) - je L(Fill2) - jg L(Fill3) -L(FillMore8): /* but less than 16 */ - 
cmp $12, %r8 - je L(Fill12) - jl L(FillLess12) - cmp $14, %r8 - jl L(Fill13) - je L(Fill14) - jg L(Fill15) -L(FillMore4): /* but less than 8 */ - cmp $6, %r8 - jl L(Fill5) - je L(Fill6) - jg L(Fill7) -L(FillLess12): /* but more than 8 */ - cmp $10, %r8 - jl L(Fill9) - je L(Fill10) - jmp L(Fill11) - - .p2align 4 -L(StrncpyFillTailWithZero1): - xor %rdx, %rdx - sub $16, %r8 - jbe L(StrncpyFillExit1) - - pxor %xmm0, %xmm0 - mov %rdx, (%rcx) - mov %rdx, 8(%rcx) - - lea 16(%rcx), %rcx - - mov %rcx, %rdx - and $0xf, %rdx - sub %rdx, %rcx - add %rdx, %r8 - xor %rdx, %rdx - sub $64, %r8 - jb L(StrncpyFillLess64) - -L(StrncpyFillLoopMovdqa): - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - movdqa %xmm0, 32(%rcx) - movdqa %xmm0, 48(%rcx) - lea 64(%rcx), %rcx - sub $64, %r8 - jae L(StrncpyFillLoopMovdqa) - -L(StrncpyFillLess64): - add $32, %r8 - jl L(StrncpyFillLess32) - movdqa %xmm0, (%rcx) - movdqa %xmm0, 16(%rcx) - lea 32(%rcx), %rcx - sub $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - -L(StrncpyFillLess32): - add $16, %r8 - jl L(StrncpyFillExit1) - movdqa %xmm0, (%rcx) - lea 16(%rcx), %rcx - jmp L(FillFrom1To16Bytes) - - .p2align 4 -L(Exit0): - mov %rdx, %rax - ret - - .p2align 4 -L(StrncpyExit15Bytes): - cmp $9, %r8 - je L(Exit9) - cmpb $0, 8(%rcx) - jz L(Exit9) - cmp $10, %r8 - je L(Exit10) - cmpb $0, 9(%rcx) - jz L(Exit10) - cmp $11, %r8 - je L(Exit11) - cmpb $0, 10(%rcx) - jz L(Exit11) - cmp $12, %r8 - je L(Exit12) - cmpb $0, 11(%rcx) - jz L(Exit12) - cmp $13, %r8 - je L(Exit13) - cmpb $0, 12(%rcx) - jz L(Exit13) - cmp $14, %r8 - je L(Exit14) - cmpb $0, 13(%rcx) - jz L(Exit14) - mov (%rcx), %rax - mov %rax, (%rdx) - mov 7(%rcx), %rax - mov %rax, 7(%rdx) -# ifdef USE_AS_STPCPY - lea 14(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - - .p2align 4 -L(StrncpyExit8Bytes): - cmp $1, %r8 - je L(Exit1) - cmpb $0, (%rcx) - jz L(Exit1) - cmp $2, %r8 - je L(Exit2) - cmpb $0, 1(%rcx) - jz L(Exit2) - cmp $3, %r8 - je L(Exit3) - cmpb $0, 2(%rcx) - jz L(Exit3) - cmp $4, %r8 - je L(Exit4) - cmpb $0, 3(%rcx) - jz L(Exit4) - cmp $5, %r8 - je L(Exit5) - cmpb $0, 4(%rcx) - jz L(Exit5) - cmp $6, %r8 - je L(Exit6) - cmpb $0, 5(%rcx) - jz L(Exit6) - cmp $7, %r8 - je L(Exit7) - cmpb $0, 6(%rcx) - jz L(Exit7) - mov (%rcx), %rax - mov %rax, (%rdx) -# ifdef USE_AS_STPCPY - lea 7(%rdx), %rax - cmpb $1, (%rax) - sbb $-1, %rax -# else - mov %rdi, %rax -# endif - ret - -# endif -# endif - -# ifdef USE_AS_STRNCPY - .p2align 4 -L(StrncpyLeaveCase2OrCase3): - test %rax, %rax - jnz L(Aligned64LeaveCase2) - -L(Aligned64LeaveCase3): - lea 64(%r8), %r8 - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase3) - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase3) - -L(Aligned64LeaveCase2): - pcmpeqb %xmm4, %xmm0 - pmovmskb %xmm0, %rax - add $48, %r8 - jle L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm5, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm4, -64(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz L(CopyFrom1To16Bytes) - - pcmpeqb %xmm6, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm5, -48(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(CopyFrom1To16BytesCase2OrCase3) - test %rax, %rax - jnz 
L(CopyFrom1To16Bytes) - - pcmpeqb %xmm7, %xmm0 - pmovmskb %xmm0, %rax - movaps %xmm6, -32(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - jmp L(CopyFrom1To16BytesCase2) -/*--------------------------------------------------*/ - .p2align 4 -L(StrncpyExit1Case2OrCase3): - movdqu -1(%rcx), %xmm0 - movdqu %xmm0, -1(%rdx) - mov $15, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit2Case2OrCase3): - movdqu -2(%rcx), %xmm0 - movdqu %xmm0, -2(%rdx) - mov $14, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit3Case2OrCase3): - movdqu -3(%rcx), %xmm0 - movdqu %xmm0, -3(%rdx) - mov $13, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit4Case2OrCase3): - movdqu -4(%rcx), %xmm0 - movdqu %xmm0, -4(%rdx) - mov $12, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit5Case2OrCase3): - movdqu -5(%rcx), %xmm0 - movdqu %xmm0, -5(%rdx) - mov $11, %rsi - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit6Case2OrCase3): - mov (%rcx), %rsi - mov 6(%rcx), %r9d - mov %r9d, 6(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $10, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit7Case2OrCase3): - mov (%rcx), %rsi - mov 5(%rcx), %r9d - mov %r9d, 5(%rdx) - mov %rsi, (%rdx) - test %rax, %rax - mov $9, %rsi - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit8Case2OrCase3): - mov (%rcx), %r9 - mov $8, %rsi - mov %r9, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit9Case2OrCase3): - mov -1(%rcx), %r9 - mov $7, %rsi - mov %r9, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit10Case2OrCase3): - mov -2(%rcx), %r9 - mov $6, %rsi - mov %r9, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit11Case2OrCase3): - mov -3(%rcx), %r9 - mov $5, %rsi - mov %r9, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit12Case2OrCase3): - mov (%rcx), %r9d - mov $4, %rsi - mov %r9d, (%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit13Case2OrCase3): - mov -1(%rcx), %r9d - mov $3, %rsi - mov %r9d, -1(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit14Case2OrCase3): - mov -2(%rcx), %r9d - mov $2, %rsi - mov %r9d, -2(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyExit15Case2OrCase3): - mov -3(%rcx), %r9d - mov $1, %rsi - mov %r9d, -3(%rdx) - test %rax, %rax - jnz L(CopyFrom1To16BytesCase2) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave1): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit1) - palignr $1, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 31(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - palignr $1, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit1) - movaps 
%xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit1): - lea 15(%rdx, %rsi), %rdx - lea 15(%rcx, %rsi), %rcx - mov -15(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -15(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave2): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit2) - palignr $2, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 30(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - palignr $2, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit2) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit2): - lea 14(%rdx, %rsi), %rdx - lea 14(%rcx, %rsi), %rcx - mov -14(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -14(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave3): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit3) - palignr $3, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 29(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - palignr $3, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit3) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit3): - lea 13(%rdx, %rsi), %rdx - lea 13(%rcx, %rsi), %rcx - mov -13(%rcx), %rsi - mov -8(%rcx), %rax - mov %rsi, -13(%rdx) - mov %rax, -8(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave4): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit4) - palignr $4, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 28(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - palignr $4, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit4) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit4): - lea 12(%rdx, %rsi), %rdx - lea 12(%rcx, %rsi), %rcx - mov -12(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -12(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave5): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit5) - palignr $5, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 27(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - palignr $5, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit5) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit5): - lea 11(%rdx, %rsi), %rdx - lea 11(%rcx, %rsi), %rcx - mov -11(%rcx), %rsi - mov -4(%rcx), %eax - mov %rsi, -11(%rdx) - mov %eax, -4(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave6): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit6) - palignr $6, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 26(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - palignr $6, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit6) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit6): - 
lea 10(%rdx, %rsi), %rdx - lea 10(%rcx, %rsi), %rcx - mov -10(%rcx), %rsi - movw -2(%rcx), %ax - mov %rsi, -10(%rdx) - movw %ax, -2(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave7): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit7) - palignr $7, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 25(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - palignr $7, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit7) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit7): - lea 9(%rdx, %rsi), %rdx - lea 9(%rcx, %rsi), %rcx - mov -9(%rcx), %rsi - movb -1(%rcx), %ah - mov %rsi, -9(%rdx) - movb %ah, -1(%rdx) - xor %rsi, %rsi - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave8): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit8) - palignr $8, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 24(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - palignr $8, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit8) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit8): - lea 8(%rdx, %rsi), %rdx - lea 8(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave9): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit9) - palignr $9, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 23(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - palignr $9, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit9) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit9): - lea 7(%rdx, %rsi), %rdx - lea 7(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave10): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit10) - palignr $10, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 22(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - palignr $10, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit10) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit10): - lea 6(%rdx, %rsi), %rdx - lea 6(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave11): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit11) - palignr $11, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 21(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - palignr $11, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit11) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit11): - lea 5(%rdx, %rsi), %rdx - lea 5(%rcx, %rsi), %rcx - mov -8(%rcx), %rax - xor %rsi, %rsi - mov %rax, -8(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave12): - movaps %xmm2, %xmm3 - add 
$48, %r8 - jle L(StrncpyExit12) - palignr $12, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 20(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - palignr $12, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit12) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit12): - lea 4(%rdx, %rsi), %rdx - lea 4(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave13): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit13) - palignr $13, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 19(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - palignr $13, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit13) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit13): - lea 3(%rdx, %rsi), %rdx - lea 3(%rcx, %rsi), %rcx - mov -4(%rcx), %eax - xor %rsi, %rsi - mov %eax, -4(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave14): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit14) - palignr $14, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 18(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - palignr $14, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit14) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit14): - lea 2(%rdx, %rsi), %rdx - lea 2(%rcx, %rsi), %rcx - movw -2(%rcx), %ax - xor %rsi, %rsi - movw %ax, -2(%rdx) - jmp L(CopyFrom1To16BytesCase3) - - .p2align 4 -L(StrncpyLeave15): - movaps %xmm2, %xmm3 - add $48, %r8 - jle L(StrncpyExit15) - palignr $15, %xmm1, %xmm2 - movaps %xmm2, (%rdx) - movaps 17(%rcx), %xmm2 - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - palignr $15, %xmm3, %xmm2 - movaps %xmm2, 16(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm4, 32(%rdx) - lea 16(%rsi), %rsi - sub $16, %r8 - jbe L(StrncpyExit15) - movaps %xmm5, 48(%rdx) - lea 16(%rsi), %rsi - lea -16(%r8), %r8 - -L(StrncpyExit15): - lea 1(%rdx, %rsi), %rdx - lea 1(%rcx, %rsi), %rcx - movb -1(%rcx), %ah - xor %rsi, %rsi - movb %ah, -1(%rdx) - jmp L(CopyFrom1To16BytesCase3) - -# endif -# ifndef USE_AS_STRCAT -END (STRCPY) -# endif -#endif diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S deleted file mode 100644 index bf82ee447d..0000000000 --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S +++ /dev/null @@ -1,3 +0,0 @@ -#define USE_AS_STRNCPY -#define STRCPY __strncpy_ssse3 -#include "strcpy-ssse3.S" -- 2.25.1 ^ permalink raw reply [flat|nested] 49+ messages in thread
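A note on what is being deleted: the bulk of strcpy-ssse3.S is fifteen near-identical L(Shl1)...L(Shl15) unrolled loops (plus an aligned fast path), one per possible byte offset between source and destination, because palignr takes its shift count as an immediate. Each step checks an aligned 16-byte source chunk for the terminator with pcmpeqb/pmovmskb, merges it with the previous chunk via palignr, and issues one aligned store. Below is a minimal C sketch of one such block using SSSE3 intrinsics; copy_shifted, SHIFT, and the alignment preconditions are illustrative assumptions of the sketch, not the removed code itself:

/* Compile with -mssse3.  */
#include <immintrin.h>

/* Offset of the string start within its aligned source chunk; palignr
   needs an immediate operand, hence one L(ShlN) loop per value 1..15.  */
#define SHIFT 11

/* Sketch only.  Assumes dst and aligned_src are 16-byte aligned, the
   string starts at aligned_src + SHIFT, and (as the real prologue has
   already verified) no NUL occurs among the string bytes that fall in
   the first chunk.  Returns a pointer to the copied terminator,
   stpcpy-style.  */
static char *
copy_shifted (char *dst, const char *aligned_src)
{
  const __m128i zero = _mm_setzero_si128 ();
  __m128i lo = _mm_load_si128 ((const __m128i *) aligned_src);

  for (;;)
    {
      /* An aligned 16-byte load may read past the terminator but can
         never cross a page boundary, which is why the original aligns
         the source before entering its loops.  */
      __m128i hi = _mm_load_si128 ((const __m128i *) (aligned_src + 16));

      /* pcmpeqb/pmovmskb: one mask bit per NUL byte in the chunk.  */
      if (_mm_movemask_epi8 (_mm_cmpeq_epi8 (hi, zero)) != 0)
        break;

      /* palignr $SHIFT: bytes SHIFT..15 of lo followed by bytes
         0..SHIFT-1 of hi, i.e. 16 consecutive string bytes written
         with a single aligned store.  */
      _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (hi, lo, SHIFT));

      lo = hi;
      aligned_src += 16;
      dst += 16;
    }

  /* The terminator lies within the next chunk; the real code dispatches
     on the mask bit position (L(CopyFrom1To16Bytes)), but a byte loop
     is enough for the sketch.  */
  const char *src = aligned_src + SHIFT;
  while ((*dst++ = *src++) != '\0')
    ;
  return dst - 1;
}

The sse2-unaligned variant kept by the ifunc list avoids all fifteen copies of this loop by using unaligned stores; trading them away is exactly the code-size argument the commit message makes.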
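Similarly, the L(Fill0)...L(Fill16) and L(StrncpyFillTailWithZero1) paths above implement the zero-padding half of the strncpy contract, with the bulk padding done as aligned movdqa stores in L(StrncpyFillLoopMovdqa) once the pointer is aligned. A plain C reference sketch of that contract (strncpy_ref is illustrative, not glibc's implementation):

#include <string.h>

char *
strncpy_ref (char *dst, const char *src, size_t n)
{
  size_t len = strnlen (src, n);    /* bytes of src to copy, at most n */
  memcpy (dst, src, len);           /* the L(Exit1)...L(Exit16) paths  */
  memset (dst + len, 0, n - len);   /* the L(Fill*) zero-padding paths */
  return dst;
}

The recurring "cmpb $1, (%rax); sbb $-1, %rax" pair in the exit paths is the branchless stpncpy return-value fixup: cmpb sets the carry flag exactly when the byte at %rax is NUL, and sbb $-1 then computes %rax + 1 - CF, so the return value stays on the written terminator when there is one and becomes dst + n when the count ran out first.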
* Re: [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein @ 2022-03-25 19:57 ` H.J. Lu 0 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:57 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result it's no longer worth the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 4 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 - > sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 - > sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 - > sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 -------------------- > sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 - > 6 files changed, 3572 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index a2ebc06c5f..292353bad7 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -42,13 +42,11 @@ sysdep_routines += \ > stpcpy-evex \ > stpcpy-sse2 \ > stpcpy-sse2-unaligned \ > - stpcpy-ssse3 \ > stpncpy-avx2 \ > stpncpy-avx2-rtm \ > stpncpy-c \ > stpncpy-evex \ > stpncpy-sse2-unaligned \ > - stpncpy-ssse3 \ > strcasecmp_l-avx2 \ > strcasecmp_l-avx2-rtm \ > strcasecmp_l-evex \ > @@ -79,7 +77,6 @@ sysdep_routines += \ > strcpy-evex \ > strcpy-sse2 \ > strcpy-sse2-unaligned \ > - strcpy-ssse3 \ > strcspn-c \ > strcspn-sse2 \ > strlen-avx2 \ > @@ -106,7 +103,6 @@ sysdep_routines += \ > strncpy-c \ > strncpy-evex \ > strncpy-sse2-unaligned \ > - strncpy-ssse3 \ > strnlen-avx2 \ > strnlen-avx2-rtm \ > strnlen-evex \ > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 4133ed7e43..505b8002e0 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ > IFUNC_IMPL (i, name, stpncpy, > - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3), > - __stpncpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), > __stpncpy_avx2) > IFUNC_IMPL_ADD (array, i, stpncpy, > @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > > /* Support sysdeps/x86_64/multiarch/stpcpy.c. 
*/ > IFUNC_IMPL (i, name, stpcpy, > - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3), > - __stpcpy_ssse3) > IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), > __stpcpy_avx2) > IFUNC_IMPL_ADD (array, i, stpcpy, > @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strcpy_evex) > - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), > - __strcpy_ssse3) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2)) > > @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > (CPU_FEATURE_USABLE (AVX512VL) > && CPU_FEATURE_USABLE (AVX512BW)), > __strncpy_evex) > - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), > - __strncpy_ssse3) > IFUNC_IMPL_ADD (array, i, strncpy, 1, > __strncpy_sse2_unaligned) > IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2)) > diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > deleted file mode 100644 > index d971c2da38..0000000000 > --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STPCPY > -#define STRCPY __stpcpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > deleted file mode 100644 > index 14ed16f6b5..0000000000 > --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_STPCPY > -#define USE_AS_STRNCPY > -#define STRCPY __stpncpy_ssse3 > -#include "strcpy-ssse3.S" > diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S > deleted file mode 100644 > index f617a535cf..0000000000 > --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S > +++ /dev/null > @@ -1,3550 +0,0 @@ > -/* strcpy with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. 
*/ > - > -#if IS_IN (libc) > - > -# ifndef USE_AS_STRCAT > -# include <sysdep.h> > - > -# ifndef STRCPY > -# define STRCPY __strcpy_ssse3 > -# endif > - > - .section .text.ssse3,"ax",@progbits > -ENTRY (STRCPY) > - > - mov %rsi, %rcx > -# ifdef USE_AS_STRNCPY > - mov %RDX_LP, %R8_LP > -# endif > - mov %rdi, %rdx > -# ifdef USE_AS_STRNCPY > - test %R8_LP, %R8_LP > - jz L(Exit0) > - cmp $8, %R8_LP > - jbe L(StrncpyExit8Bytes) > -# endif > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - cmpb $0, 7(%rcx) > - jz L(Exit8) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - jb L(StrncpyExit15Bytes) > -# endif > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - cmpb $0, 14(%rcx) > - jz L(Exit15) > -# ifdef USE_AS_STRNCPY > - cmp $16, %r8 > - je L(Exit16) > -# endif > - cmpb $0, 15(%rcx) > - jz L(Exit16) > -# endif > - > -# ifdef USE_AS_STRNCPY > - mov %rcx, %rsi > - sub $16, %r8 > - and $0xf, %rsi > - > -/* add 16 bytes rcx_offset to r8 */ > - > - add %rsi, %r8 > -# endif > - lea 16(%rcx), %rsi > - and $-16, %rsi > - pxor %xmm0, %xmm0 > - mov (%rcx), %r9 > - mov %r9, (%rdx) > - pcmpeqb (%rsi), %xmm0 > - mov 8(%rcx), %r9 > - mov %r9, 8(%rdx) > - > -/* convert byte mask in xmm0 to bit mask */ > - > - pmovmskb %xmm0, %rax > - sub %rcx, %rsi > - > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - mov %rdx, %rax > - lea 16(%rdx), %rdx > - and $-16, %rdx > - sub %rdx, %rax > - > -# ifdef USE_AS_STRNCPY > - add %rax, %rsi > - lea -1(%rsi), %rsi > - and $1<<31, %esi > - test %rsi, %rsi > - jnz L(ContinueCopy) > - lea 16(%r8), %r8 > - > -L(ContinueCopy): > -# endif > - sub %rax, %rcx > - mov %rcx, %rax > - and $0xf, %rax > - mov $0, %rsi > - > -/* case: rcx_offset == rdx_offset */ > - > - jz L(Align16Both) > - > - cmp $8, %rax > - jae L(ShlHigh8) > - cmp $1, %rax > - je L(Shl1) > - cmp $2, %rax > - je L(Shl2) > - cmp $3, %rax > - je L(Shl3) > - cmp $4, %rax > - je L(Shl4) > - cmp $5, %rax > - je L(Shl5) > - cmp $6, %rax > - je L(Shl6) > - jmp L(Shl7) > - > -L(ShlHigh8): > - je L(Shl8) > - cmp $9, %rax > - je L(Shl9) > - cmp $10, %rax > - je L(Shl10) > - cmp $11, %rax > - je L(Shl11) > - cmp $12, %rax > - je L(Shl12) > - cmp $13, %rax > - je L(Shl13) > - cmp $14, %rax > - je L(Shl14) > - jmp L(Shl15) > - > -L(Align16Both): > - movaps (%rcx), %xmm1 > - movaps 16(%rcx), %xmm2 > - movaps %xmm1, (%rdx) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm4 > - movaps %xmm3, (%rdx, %rsi) > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# 
endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm1 > - movaps %xmm4, (%rdx, %rsi) > - pcmpeqb %xmm1, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm2 > - movaps %xmm1, (%rdx, %rsi) > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps 16(%rcx, %rsi), %xmm3 > - movaps %xmm2, (%rdx, %rsi) > - pcmpeqb %xmm3, %xmm0 > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm3, (%rdx, %rsi) > - mov %rcx, %rax > - lea 16(%rcx, %rsi), %rcx > - and $-0x40, %rcx > - sub %rcx, %rax > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - lea 112(%r8, %rax), %r8 > -# endif > - mov $-0x40, %rsi > - > - .p2align 4 > -L(Aligned64Loop): > - movaps (%rcx), %xmm2 > - movaps %xmm2, %xmm4 > - movaps 16(%rcx), %xmm5 > - movaps 32(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 48(%rcx), %xmm7 > - pminub %xmm5, %xmm2 > - pminub %xmm7, %xmm3 > - pminub %xmm2, %xmm3 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %rax > - lea 64(%rdx), %rdx > - lea 64(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeaveCase2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Aligned64Leave) > - movaps %xmm4, -64(%rdx) > - movaps %xmm5, -48(%rdx) > - movaps %xmm6, -32(%rdx) > - movaps %xmm7, -16(%rdx) > - jmp L(Aligned64Loop) > - > -L(Aligned64Leave): > -# ifdef USE_AS_STRNCPY > - lea 48(%r8), %r8 > -# endif > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - test %rax, %rax > - lea 16(%rsi), %rsi > - jnz L(CopyFrom1To16Bytes) > - > - movaps %xmm6, -32(%rdx) > - pcmpeqb %xmm7, %xmm0 > -# ifdef USE_AS_STRNCPY > - lea -16(%r8), %r8 > -# endif > - pmovmskb %xmm0, %rax > - lea 16(%rsi), %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl1): > - movaps -1(%rcx), %xmm1 > - movaps 15(%rcx), %xmm2 > -L(Shl1Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, 
%rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit1Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl1LoopExit) > - > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 31(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -15(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -1(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl1LoopStart): > - movaps 15(%rcx), %xmm2 > - movaps 31(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 47(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 63(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $1, %xmm4, %xmm5 > - test %rax, %rax > - palignr $1, %xmm3, %xmm4 > - jnz L(Shl1Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave1) > -# endif > - palignr $1, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $1, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl1LoopStart) > - > -L(Shl1LoopExit): > - movdqu -1(%rcx), %xmm1 > - mov $15, %rsi > - movdqu %xmm1, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl2): > - movaps -2(%rcx), %xmm1 > - movaps 14(%rcx), %xmm2 > -L(Shl2Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit2Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl2LoopExit) > - > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 30(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -14(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -2(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl2LoopStart): > - movaps 14(%rcx), %xmm2 > - movaps 30(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 46(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 62(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr 
$2, %xmm4, %xmm5 > - test %rax, %rax > - palignr $2, %xmm3, %xmm4 > - jnz L(Shl2Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave2) > -# endif > - palignr $2, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $2, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl2LoopStart) > - > -L(Shl2LoopExit): > - movdqu -2(%rcx), %xmm1 > - mov $14, %rsi > - movdqu %xmm1, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl3): > - movaps -3(%rcx), %xmm1 > - movaps 13(%rcx), %xmm2 > -L(Shl3Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit3Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl3LoopExit) > - > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 29(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -13(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -3(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl3LoopStart): > - movaps 13(%rcx), %xmm2 > - movaps 29(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 45(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 61(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $3, %xmm4, %xmm5 > - test %rax, %rax > - palignr $3, %xmm3, %xmm4 > - jnz L(Shl3Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave3) > -# endif > - palignr $3, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $3, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl3LoopStart) > - > -L(Shl3LoopExit): > - movdqu -3(%rcx), %xmm1 > - mov $13, %rsi > - movdqu %xmm1, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl4): > - movaps -4(%rcx), %xmm1 > - movaps 12(%rcx), %xmm2 > -L(Shl4Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, 
%rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit4Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl4LoopExit) > - > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 28(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -12(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -4(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl4LoopStart): > - movaps 12(%rcx), %xmm2 > - movaps 28(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 44(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 60(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $4, %xmm4, %xmm5 > - test %rax, %rax > - palignr $4, %xmm3, %xmm4 > - jnz L(Shl4Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave4) > -# endif > - palignr $4, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $4, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl4LoopStart) > - > -L(Shl4LoopExit): > - movdqu -4(%rcx), %xmm1 > - mov $12, %rsi > - movdqu %xmm1, -4(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl5): > - movaps -5(%rcx), %xmm1 > - movaps 11(%rcx), %xmm2 > -L(Shl5Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit5Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl5LoopExit) > - > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 27(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - 
lea -11(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -5(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl5LoopStart): > - movaps 11(%rcx), %xmm2 > - movaps 27(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 43(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 59(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $5, %xmm4, %xmm5 > - test %rax, %rax > - palignr $5, %xmm3, %xmm4 > - jnz L(Shl5Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave5) > -# endif > - palignr $5, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $5, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl5LoopStart) > - > -L(Shl5LoopExit): > - movdqu -5(%rcx), %xmm1 > - mov $11, %rsi > - movdqu %xmm1, -5(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl6): > - movaps -6(%rcx), %xmm1 > - movaps 10(%rcx), %xmm2 > -L(Shl6Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit6Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl6LoopExit) > - > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 26(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -10(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -6(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl6LoopStart): > - movaps 10(%rcx), %xmm2 > - movaps 26(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 42(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 58(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $6, %xmm4, %xmm5 > - test %rax, %rax > - palignr $6, %xmm3, %xmm4 > - jnz L(Shl6Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave6) > -# endif > - palignr $6, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $6, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl6LoopStart) > - > -L(Shl6LoopExit): > - mov (%rcx), %r9 > - mov 6(%rcx), %esi > - mov %r9, (%rdx) > - mov 
%esi, 6(%rdx) > - mov $10, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl7): > - movaps -7(%rcx), %xmm1 > - movaps 9(%rcx), %xmm2 > -L(Shl7Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit7Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl7LoopExit) > - > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 25(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -9(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -7(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl7LoopStart): > - movaps 9(%rcx), %xmm2 > - movaps 25(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 41(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 57(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $7, %xmm4, %xmm5 > - test %rax, %rax > - palignr $7, %xmm3, %xmm4 > - jnz L(Shl7Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave7) > -# endif > - palignr $7, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $7, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl7LoopStart) > - > -L(Shl7LoopExit): > - mov (%rcx), %r9 > - mov 5(%rcx), %esi > - mov %r9, (%rdx) > - mov %esi, 5(%rdx) > - mov $9, %rsi > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl8): > - movaps -8(%rcx), %xmm1 > - movaps 8(%rcx), %xmm2 > -L(Shl8Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe 
L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit8Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl8LoopExit) > - > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 24(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -8(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -8(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl8LoopStart): > - movaps 8(%rcx), %xmm2 > - movaps 24(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 40(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 56(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $8, %xmm4, %xmm5 > - test %rax, %rax > - palignr $8, %xmm3, %xmm4 > - jnz L(Shl8Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave8) > -# endif > - palignr $8, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $8, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl8LoopStart) > - > -L(Shl8LoopExit): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl9): > - movaps -9(%rcx), %xmm1 > - movaps 7(%rcx), %xmm2 > -L(Shl9Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit9Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl9LoopExit) > - > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 23(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -7(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -9(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl9LoopStart): > - movaps 7(%rcx), %xmm2 > - movaps 23(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 39(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 55(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - 
movaps %xmm5, %xmm7 > - palignr $9, %xmm4, %xmm5 > - test %rax, %rax > - palignr $9, %xmm3, %xmm4 > - jnz L(Shl9Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave9) > -# endif > - palignr $9, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $9, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl9LoopStart) > - > -L(Shl9LoopExit): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl10): > - movaps -10(%rcx), %xmm1 > - movaps 6(%rcx), %xmm2 > -L(Shl10Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit10Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl10LoopExit) > - > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 22(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -6(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -10(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl10LoopStart): > - movaps 6(%rcx), %xmm2 > - movaps 22(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 38(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 54(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $10, %xmm4, %xmm5 > - test %rax, %rax > - palignr $10, %xmm3, %xmm4 > - jnz L(Shl10Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave10) > -# endif > - palignr $10, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $10, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl10LoopStart) > - > -L(Shl10LoopExit): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl11): > - movaps -11(%rcx), %xmm1 > - movaps 5(%rcx), %xmm2 > -L(Shl11Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 
16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit11Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl11LoopExit) > - > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 21(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -5(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -11(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl11LoopStart): > - movaps 5(%rcx), %xmm2 > - movaps 21(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 37(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 53(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $11, %xmm4, %xmm5 > - test %rax, %rax > - palignr $11, %xmm3, %xmm4 > - jnz L(Shl11Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave11) > -# endif > - palignr $11, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $11, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl11LoopStart) > - > -L(Shl11LoopExit): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl12): > - movaps -12(%rcx), %xmm1 > - movaps 4(%rcx), %xmm2 > -L(Shl12Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit12Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl12LoopExit) > - > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 20(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov 
%rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -4(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -12(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl12LoopStart): > - movaps 4(%rcx), %xmm2 > - movaps 20(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 36(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 52(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $12, %xmm4, %xmm5 > - test %rax, %rax > - palignr $12, %xmm3, %xmm4 > - jnz L(Shl12Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave12) > -# endif > - palignr $12, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $12, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl12LoopStart) > - > -L(Shl12LoopExit): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl13): > - movaps -13(%rcx), %xmm1 > - movaps 3(%rcx), %xmm2 > -L(Shl13Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit13Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl13LoopExit) > - > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 19(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -3(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -13(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl13LoopStart): > - movaps 3(%rcx), %xmm2 > - movaps 19(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 35(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 51(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $13, %xmm4, %xmm5 > - test %rax, %rax > - palignr $13, %xmm3, %xmm4 > - jnz L(Shl13Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave13) > -# endif > - palignr $13, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $13, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl13LoopStart) > - > -L(Shl13LoopExit): 
> - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl14): > - movaps -14(%rcx), %xmm1 > - movaps 2(%rcx), %xmm2 > -L(Shl14Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit14Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl14LoopExit) > - > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 18(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -2(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -14(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl14LoopStart): > - movaps 2(%rcx), %xmm2 > - movaps 18(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 34(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 50(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $14, %xmm4, %xmm5 > - test %rax, %rax > - palignr $14, %xmm3, %xmm4 > - jnz L(Shl14Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave14) > -# endif > - palignr $14, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $14, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl14LoopStart) > - > -L(Shl14LoopExit): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - jmp L(CopyFrom1To16Bytes) > - > - .p2align 4 > -L(Shl15): > - movaps -15(%rcx), %xmm1 > - movaps 1(%rcx), %xmm2 > -L(Shl15Start): > - pcmpeqb %xmm2, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm1 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > - movaps %xmm2, %xmm3 > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 
> - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - > - pcmpeqb %xmm2, %xmm0 > - lea 16(%rdx), %rdx > - pmovmskb %xmm0, %rax > - lea 16(%rcx), %rcx > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - jbe L(StrncpyExit15Case2OrCase3) > -# endif > - test %rax, %rax > - jnz L(Shl15LoopExit) > - > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, (%rdx) > - lea 17(%rcx), %rcx > - lea 16(%rdx), %rdx > - > - mov %rcx, %rax > - and $-0x40, %rcx > - sub %rcx, %rax > - lea -1(%rcx), %rcx > - sub %rax, %rdx > -# ifdef USE_AS_STRNCPY > - add %rax, %r8 > -# endif > - movaps -15(%rcx), %xmm1 > - > -/* 64 bytes loop */ > - .p2align 4 > -L(Shl15LoopStart): > - movaps 1(%rcx), %xmm2 > - movaps 17(%rcx), %xmm3 > - movaps %xmm3, %xmm6 > - movaps 33(%rcx), %xmm4 > - movaps %xmm4, %xmm7 > - movaps 49(%rcx), %xmm5 > - pminub %xmm2, %xmm6 > - pminub %xmm5, %xmm7 > - pminub %xmm6, %xmm7 > - pcmpeqb %xmm0, %xmm7 > - pmovmskb %xmm7, %rax > - movaps %xmm5, %xmm7 > - palignr $15, %xmm4, %xmm5 > - test %rax, %rax > - palignr $15, %xmm3, %xmm4 > - jnz L(Shl15Start) > -# ifdef USE_AS_STRNCPY > - sub $64, %r8 > - jbe L(StrncpyLeave15) > -# endif > - palignr $15, %xmm2, %xmm3 > - lea 64(%rcx), %rcx > - palignr $15, %xmm1, %xmm2 > - movaps %xmm7, %xmm1 > - movaps %xmm5, 48(%rdx) > - movaps %xmm4, 32(%rdx) > - movaps %xmm3, 16(%rdx) > - movaps %xmm2, (%rdx) > - lea 64(%rdx), %rdx > - jmp L(Shl15LoopStart) > - > -L(Shl15LoopExit): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > -# ifdef USE_AS_STRCAT > - jmp L(CopyFrom1To16Bytes) > -# endif > - > -# ifndef USE_AS_STRCAT > - > - .p2align 4 > -L(CopyFrom1To16Bytes): > -# ifdef USE_AS_STRNCPY > - add $16, %r8 > -# endif > - add %rsi, %rdx > - add %rsi, %rcx > - > - test %al, %al > - jz L(ExitHigh) > - test $0x01, %al > - jnz L(Exit1) > - test $0x02, %al > - jnz L(Exit2) > - test $0x04, %al > - jnz L(Exit3) > - test $0x08, %al > - jnz L(Exit4) > - test $0x10, %al > - jnz L(Exit5) > - test $0x20, %al > - jnz L(Exit6) > - test $0x40, %al > - jnz L(Exit7) > - > - .p2align 4 > -L(Exit8): > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $8, %r8 > - lea 8(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(ExitHigh): > - test $0x01, %ah > - jnz L(Exit9) > - test $0x02, %ah > - jnz L(Exit10) > - test $0x04, %ah > - jnz L(Exit11) > - test $0x08, %ah > - jnz L(Exit12) > - test $0x10, %ah > - jnz L(Exit13) > - test $0x20, %ah > - jnz L(Exit14) > - test $0x40, %ah > - jnz L(Exit15) > - > - .p2align 4 > -L(Exit16): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %rax > - mov %rax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 15(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $16, %r8 > - lea 16(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - > - .p2align 4 > -L(CopyFrom1To16BytesCase2): > - add $16, %r8 > - add %rsi, %rcx > - lea (%rsi, %rdx), %rsi > - lea -9(%r8), %rdx > - and $1<<7, %dh > - or %al, %dh > - test %dh, %dh > - lea (%rsi), %rdx > - jz L(ExitHighCase2) > - > - cmp $1, %r8 > - je L(Exit1) > - test $0x01, %al > - jnz L(Exit1) > - cmp $2, %r8 > - je 
L(Exit2) > - test $0x02, %al > - jnz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - test $0x04, %al > - jnz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - test $0x08, %al > - jnz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - test $0x10, %al > - jnz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - test $0x20, %al > - jnz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - test $0x40, %al > - jnz L(Exit7) > - jmp L(Exit8) > - > - .p2align 4 > -L(ExitHighCase2): > - cmp $9, %r8 > - je L(Exit9) > - test $0x01, %ah > - jnz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - test $0x02, %ah > - jnz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - test $0x04, %ah > - jnz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - test $0x8, %ah > - jnz L(Exit12) > - cmp $13, %r8 > - je L(Exit13) > - test $0x10, %ah > - jnz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - test $0x20, %ah > - jnz L(Exit14) > - cmp $15, %r8 > - je L(Exit15) > - test $0x40, %ah > - jnz L(Exit15) > - jmp L(Exit16) > - > -L(CopyFrom1To16BytesCase2OrCase3): > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - > - .p2align 4 > -L(CopyFrom1To16BytesCase3): > - add $16, %r8 > - add %rsi, %rdx > - add %rsi, %rcx > - > - cmp $16, %r8 > - je L(Exit16) > - cmp $8, %r8 > - je L(Exit8) > - jg L(More8Case3) > - cmp $4, %r8 > - je L(Exit4) > - jg L(More4Case3) > - cmp $2, %r8 > - jl L(Exit1) > - je L(Exit2) > - jg L(Exit3) > -L(More8Case3): /* but less than 16 */ > - cmp $12, %r8 > - je L(Exit12) > - jl L(Less12Case3) > - cmp $14, %r8 > - jl L(Exit13) > - je L(Exit14) > - jg L(Exit15) > -L(More4Case3): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Exit5) > - je L(Exit6) > - jg L(Exit7) > -L(Less12Case3): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Exit9) > - je L(Exit10) > - jg L(Exit11) > -# endif > - > - .p2align 4 > -L(Exit1): > - movb (%rcx), %al > - movb %al, (%rdx) > -# ifdef USE_AS_STPCPY > - lea (%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $1, %r8 > - lea 1(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit2): > - movw (%rcx), %ax > - movw %ax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 1(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $2, %r8 > - lea 2(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit3): > - movw (%rcx), %ax > - movw %ax, (%rdx) > - movb 2(%rcx), %al > - movb %al, 2(%rdx) > -# ifdef USE_AS_STPCPY > - lea 2(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $3, %r8 > - lea 3(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit4): > - movl (%rcx), %eax > - movl %eax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 3(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $4, %r8 > - lea 4(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit5): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movb 4(%rcx), %al > - movb %al, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 4(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $5, %r8 > - lea 5(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef 
USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit6): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movw 4(%rcx), %ax > - movw %ax, 4(%rdx) > -# ifdef USE_AS_STPCPY > - lea 5(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $6, %r8 > - lea 6(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit7): > - movl (%rcx), %eax > - movl %eax, (%rdx) > - movl 3(%rcx), %eax > - movl %eax, 3(%rdx) > -# ifdef USE_AS_STPCPY > - lea 6(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $7, %r8 > - lea 7(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit9): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %eax > - mov %eax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 8(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $9, %r8 > - lea 9(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit10): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %eax > - mov %eax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 9(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $10, %r8 > - lea 10(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit11): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %eax > - mov %eax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 10(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $11, %r8 > - lea 11(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit12): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 8(%rcx), %eax > - mov %eax, 8(%rdx) > -# ifdef USE_AS_STPCPY > - lea 11(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $12, %r8 > - lea 12(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit13): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 5(%rcx), %rax > - mov %rax, 5(%rdx) > -# ifdef USE_AS_STPCPY > - lea 12(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $13, %r8 > - lea 13(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit14): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 6(%rcx), %rax > - mov %rax, 6(%rdx) > -# ifdef USE_AS_STPCPY > - lea 13(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $14, %r8 > - lea 14(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > - .p2align 4 > -L(Exit15): > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > -# else > - mov %rdi, %rax > -# endif > -# ifdef USE_AS_STRNCPY > - sub $15, %r8 > - lea 
15(%rdx), %rcx > - jnz L(StrncpyFillTailWithZero1) > -# ifdef USE_AS_STPCPY > - cmpb $1, (%rax) > - sbb $-1, %rax > -# endif > -# endif > - ret > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(Fill0): > - ret > - > - .p2align 4 > -L(Fill1): > - movb %dl, (%rcx) > - ret > - > - .p2align 4 > -L(Fill2): > - movw %dx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill3): > - movw %dx, (%rcx) > - movb %dl, 2(%rcx) > - ret > - > - .p2align 4 > -L(Fill4): > - movl %edx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill5): > - movl %edx, (%rcx) > - movb %dl, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill6): > - movl %edx, (%rcx) > - movw %dx, 4(%rcx) > - ret > - > - .p2align 4 > -L(Fill7): > - movl %edx, (%rcx) > - movl %edx, 3(%rcx) > - ret > - > - .p2align 4 > -L(Fill8): > - mov %rdx, (%rcx) > - ret > - > - .p2align 4 > -L(Fill9): > - mov %rdx, (%rcx) > - movb %dl, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill10): > - mov %rdx, (%rcx) > - movw %dx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill11): > - mov %rdx, (%rcx) > - movl %edx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill12): > - mov %rdx, (%rcx) > - movl %edx, 8(%rcx) > - ret > - > - .p2align 4 > -L(Fill13): > - mov %rdx, (%rcx) > - mov %rdx, 5(%rcx) > - ret > - > - .p2align 4 > -L(Fill14): > - mov %rdx, (%rcx) > - mov %rdx, 6(%rcx) > - ret > - > - .p2align 4 > -L(Fill15): > - mov %rdx, (%rcx) > - mov %rdx, 7(%rcx) > - ret > - > - .p2align 4 > -L(Fill16): > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - ret > - > - .p2align 4 > -L(StrncpyFillExit1): > - lea 16(%r8), %r8 > -L(FillFrom1To16Bytes): > - test %r8, %r8 > - jz L(Fill0) > - cmp $16, %r8 > - je L(Fill16) > - cmp $8, %r8 > - je L(Fill8) > - jg L(FillMore8) > - cmp $4, %r8 > - je L(Fill4) > - jg L(FillMore4) > - cmp $2, %r8 > - jl L(Fill1) > - je L(Fill2) > - jg L(Fill3) > -L(FillMore8): /* but less than 16 */ > - cmp $12, %r8 > - je L(Fill12) > - jl L(FillLess12) > - cmp $14, %r8 > - jl L(Fill13) > - je L(Fill14) > - jg L(Fill15) > -L(FillMore4): /* but less than 8 */ > - cmp $6, %r8 > - jl L(Fill5) > - je L(Fill6) > - jg L(Fill7) > -L(FillLess12): /* but more than 8 */ > - cmp $10, %r8 > - jl L(Fill9) > - je L(Fill10) > - jmp L(Fill11) > - > - .p2align 4 > -L(StrncpyFillTailWithZero1): > - xor %rdx, %rdx > - sub $16, %r8 > - jbe L(StrncpyFillExit1) > - > - pxor %xmm0, %xmm0 > - mov %rdx, (%rcx) > - mov %rdx, 8(%rcx) > - > - lea 16(%rcx), %rcx > - > - mov %rcx, %rdx > - and $0xf, %rdx > - sub %rdx, %rcx > - add %rdx, %r8 > - xor %rdx, %rdx > - sub $64, %r8 > - jb L(StrncpyFillLess64) > - > -L(StrncpyFillLoopMovdqa): > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - movdqa %xmm0, 32(%rcx) > - movdqa %xmm0, 48(%rcx) > - lea 64(%rcx), %rcx > - sub $64, %r8 > - jae L(StrncpyFillLoopMovdqa) > - > -L(StrncpyFillLess64): > - add $32, %r8 > - jl L(StrncpyFillLess32) > - movdqa %xmm0, (%rcx) > - movdqa %xmm0, 16(%rcx) > - lea 32(%rcx), %rcx > - sub $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > -L(StrncpyFillLess32): > - add $16, %r8 > - jl L(StrncpyFillExit1) > - movdqa %xmm0, (%rcx) > - lea 16(%rcx), %rcx > - jmp L(FillFrom1To16Bytes) > - > - .p2align 4 > -L(Exit0): > - mov %rdx, %rax > - ret > - > - .p2align 4 > -L(StrncpyExit15Bytes): > - cmp $9, %r8 > - je L(Exit9) > - cmpb $0, 8(%rcx) > - jz L(Exit9) > - cmp $10, %r8 > - je L(Exit10) > - cmpb $0, 9(%rcx) > - jz L(Exit10) > - cmp $11, %r8 > - je L(Exit11) > - cmpb $0, 10(%rcx) > - jz L(Exit11) > - cmp $12, %r8 > - je L(Exit12) > - cmpb $0, 11(%rcx) > - jz L(Exit12) > - cmp 
$13, %r8 > - je L(Exit13) > - cmpb $0, 12(%rcx) > - jz L(Exit13) > - cmp $14, %r8 > - je L(Exit14) > - cmpb $0, 13(%rcx) > - jz L(Exit14) > - mov (%rcx), %rax > - mov %rax, (%rdx) > - mov 7(%rcx), %rax > - mov %rax, 7(%rdx) > -# ifdef USE_AS_STPCPY > - lea 14(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > - .p2align 4 > -L(StrncpyExit8Bytes): > - cmp $1, %r8 > - je L(Exit1) > - cmpb $0, (%rcx) > - jz L(Exit1) > - cmp $2, %r8 > - je L(Exit2) > - cmpb $0, 1(%rcx) > - jz L(Exit2) > - cmp $3, %r8 > - je L(Exit3) > - cmpb $0, 2(%rcx) > - jz L(Exit3) > - cmp $4, %r8 > - je L(Exit4) > - cmpb $0, 3(%rcx) > - jz L(Exit4) > - cmp $5, %r8 > - je L(Exit5) > - cmpb $0, 4(%rcx) > - jz L(Exit5) > - cmp $6, %r8 > - je L(Exit6) > - cmpb $0, 5(%rcx) > - jz L(Exit6) > - cmp $7, %r8 > - je L(Exit7) > - cmpb $0, 6(%rcx) > - jz L(Exit7) > - mov (%rcx), %rax > - mov %rax, (%rdx) > -# ifdef USE_AS_STPCPY > - lea 7(%rdx), %rax > - cmpb $1, (%rax) > - sbb $-1, %rax > -# else > - mov %rdi, %rax > -# endif > - ret > - > -# endif > -# endif > - > -# ifdef USE_AS_STRNCPY > - .p2align 4 > -L(StrncpyLeaveCase2OrCase3): > - test %rax, %rax > - jnz L(Aligned64LeaveCase2) > - > -L(Aligned64LeaveCase3): > - lea 64(%r8), %r8 > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase3) > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase3) > - > -L(Aligned64LeaveCase2): > - pcmpeqb %xmm4, %xmm0 > - pmovmskb %xmm0, %rax > - add $48, %r8 > - jle L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm5, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm4, -64(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm6, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm5, -48(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(CopyFrom1To16BytesCase2OrCase3) > - test %rax, %rax > - jnz L(CopyFrom1To16Bytes) > - > - pcmpeqb %xmm7, %xmm0 > - pmovmskb %xmm0, %rax > - movaps %xmm6, -32(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - jmp L(CopyFrom1To16BytesCase2) > -/*--------------------------------------------------*/ > - .p2align 4 > -L(StrncpyExit1Case2OrCase3): > - movdqu -1(%rcx), %xmm0 > - movdqu %xmm0, -1(%rdx) > - mov $15, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit2Case2OrCase3): > - movdqu -2(%rcx), %xmm0 > - movdqu %xmm0, -2(%rdx) > - mov $14, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit3Case2OrCase3): > - movdqu -3(%rcx), %xmm0 > - movdqu %xmm0, -3(%rdx) > - mov $13, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit4Case2OrCase3): > - movdqu -4(%rcx), %xmm0 > - movdqu %xmm0, -4(%rdx) > - mov $12, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit5Case2OrCase3): > - movdqu -5(%rcx), %xmm0 > - movdqu %xmm0, -5(%rdx) > - mov $11, %rsi > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > 
-L(StrncpyExit6Case2OrCase3): > - mov (%rcx), %rsi > - mov 6(%rcx), %r9d > - mov %r9d, 6(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $10, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit7Case2OrCase3): > - mov (%rcx), %rsi > - mov 5(%rcx), %r9d > - mov %r9d, 5(%rdx) > - mov %rsi, (%rdx) > - test %rax, %rax > - mov $9, %rsi > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit8Case2OrCase3): > - mov (%rcx), %r9 > - mov $8, %rsi > - mov %r9, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit9Case2OrCase3): > - mov -1(%rcx), %r9 > - mov $7, %rsi > - mov %r9, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit10Case2OrCase3): > - mov -2(%rcx), %r9 > - mov $6, %rsi > - mov %r9, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit11Case2OrCase3): > - mov -3(%rcx), %r9 > - mov $5, %rsi > - mov %r9, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit12Case2OrCase3): > - mov (%rcx), %r9d > - mov $4, %rsi > - mov %r9d, (%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit13Case2OrCase3): > - mov -1(%rcx), %r9d > - mov $3, %rsi > - mov %r9d, -1(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit14Case2OrCase3): > - mov -2(%rcx), %r9d > - mov $2, %rsi > - mov %r9d, -2(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyExit15Case2OrCase3): > - mov -3(%rcx), %r9d > - mov $1, %rsi > - mov %r9d, -3(%rdx) > - test %rax, %rax > - jnz L(CopyFrom1To16BytesCase2) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave1): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit1) > - palignr $1, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 31(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - palignr $1, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit1) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit1): > - lea 15(%rdx, %rsi), %rdx > - lea 15(%rcx, %rsi), %rcx > - mov -15(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -15(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave2): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit2) > - palignr $2, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 30(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - palignr $2, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit2) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit2): > - lea 14(%rdx, %rsi), %rdx > - lea 14(%rcx, %rsi), %rcx > - mov -14(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -14(%rdx) > - mov %rax, 
-8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave3): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit3) > - palignr $3, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 29(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - palignr $3, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit3) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit3): > - lea 13(%rdx, %rsi), %rdx > - lea 13(%rcx, %rsi), %rcx > - mov -13(%rcx), %rsi > - mov -8(%rcx), %rax > - mov %rsi, -13(%rdx) > - mov %rax, -8(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave4): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit4) > - palignr $4, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 28(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - palignr $4, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit4) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit4): > - lea 12(%rdx, %rsi), %rdx > - lea 12(%rcx, %rsi), %rcx > - mov -12(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -12(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave5): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit5) > - palignr $5, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 27(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - palignr $5, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit5) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit5): > - lea 11(%rdx, %rsi), %rdx > - lea 11(%rcx, %rsi), %rcx > - mov -11(%rcx), %rsi > - mov -4(%rcx), %eax > - mov %rsi, -11(%rdx) > - mov %eax, -4(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave6): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit6) > - palignr $6, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 26(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - palignr $6, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit6) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit6): > - lea 10(%rdx, %rsi), %rdx > - lea 10(%rcx, %rsi), %rcx > - mov -10(%rcx), %rsi > - movw -2(%rcx), %ax > - mov %rsi, -10(%rdx) > - movw %ax, -2(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave7): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit7) > - palignr $7, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 25(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - palignr $7, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit7) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > 
- jbe L(StrncpyExit7) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit7): > - lea 9(%rdx, %rsi), %rdx > - lea 9(%rcx, %rsi), %rcx > - mov -9(%rcx), %rsi > - movb -1(%rcx), %ah > - mov %rsi, -9(%rdx) > - movb %ah, -1(%rdx) > - xor %rsi, %rsi > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave8): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit8) > - palignr $8, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 24(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - palignr $8, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit8) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit8): > - lea 8(%rdx, %rsi), %rdx > - lea 8(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave9): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit9) > - palignr $9, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 23(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - palignr $9, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit9) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit9): > - lea 7(%rdx, %rsi), %rdx > - lea 7(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave10): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit10) > - palignr $10, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 22(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - palignr $10, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit10) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit10): > - lea 6(%rdx, %rsi), %rdx > - lea 6(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave11): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit11) > - palignr $11, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 21(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - palignr $11, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit11) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit11): > - lea 5(%rdx, %rsi), %rdx > - lea 5(%rcx, %rsi), %rcx > - mov -8(%rcx), %rax > - xor %rsi, %rsi > - mov %rax, -8(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave12): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit12) > - palignr $12, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 20(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - palignr $12, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe 
L(StrncpyExit12) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit12) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit12): > - lea 4(%rdx, %rsi), %rdx > - lea 4(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave13): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit13) > - palignr $13, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 19(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - palignr $13, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit13) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit13): > - lea 3(%rdx, %rsi), %rdx > - lea 3(%rcx, %rsi), %rcx > - mov -4(%rcx), %eax > - xor %rsi, %rsi > - mov %eax, -4(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave14): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit14) > - palignr $14, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 18(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - palignr $14, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit14) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit14): > - lea 2(%rdx, %rsi), %rdx > - lea 2(%rcx, %rsi), %rcx > - movw -2(%rcx), %ax > - xor %rsi, %rsi > - movw %ax, -2(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > - .p2align 4 > -L(StrncpyLeave15): > - movaps %xmm2, %xmm3 > - add $48, %r8 > - jle L(StrncpyExit15) > - palignr $15, %xmm1, %xmm2 > - movaps %xmm2, (%rdx) > - movaps 17(%rcx), %xmm2 > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - palignr $15, %xmm3, %xmm2 > - movaps %xmm2, 16(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm4, 32(%rdx) > - lea 16(%rsi), %rsi > - sub $16, %r8 > - jbe L(StrncpyExit15) > - movaps %xmm5, 48(%rdx) > - lea 16(%rsi), %rsi > - lea -16(%r8), %r8 > - > -L(StrncpyExit15): > - lea 1(%rdx, %rsi), %rdx > - lea 1(%rcx, %rsi), %rcx > - movb -1(%rcx), %ah > - xor %rsi, %rsi > - movb %ah, -1(%rdx) > - jmp L(CopyFrom1To16BytesCase3) > - > -# endif > -# ifndef USE_AS_STRCAT > -END (STRCPY) > -# endif > -#endif > diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S > deleted file mode 100644 > index bf82ee447d..0000000000 > --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S > +++ /dev/null > @@ -1,3 +0,0 @@ > -#define USE_AS_STRNCPY > -#define STRCPY __strncpy_ssse3 > -#include "strcpy-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
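For reference while reading the deleted code above: the L(Shl1) through
L(Shl15) blocks are fifteen instantiations of one pattern, specialized per
relative source misalignment so that every load and store in the 64-byte
loop is 16-byte aligned. A rough C-intrinsics sketch of one iteration of
the misalignment-4 case follows; the function name and interface are
hypothetical and the careful 16-byte exit path is elided, so this is an
illustration of the technique, not the glibc code.

    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (palignr) */

    /* One 64-byte iteration of L(Shl4LoopStart), assuming src - 4 and
       dst are 16-byte aligned.  Returns the nonzero pcmpeqb mask if a
       NUL byte was seen, in which case nothing is stored and the
       caller must fall back to the careful 16-byte path (the asm
       jumps back to L(Shl4Start)).  */
    static int
    copy64_shift4 (char *dst, const char *src)
    {
      /* All five loads hit 16-byte-aligned addresses.  */
      __m128i x1 = _mm_load_si128 ((const __m128i *) (src - 4));
      __m128i x2 = _mm_load_si128 ((const __m128i *) (src + 12));
      __m128i x3 = _mm_load_si128 ((const __m128i *) (src + 28));
      __m128i x4 = _mm_load_si128 ((const __m128i *) (src + 44));
      __m128i x5 = _mm_load_si128 ((const __m128i *) (src + 60));

      /* pminub folds four blocks into one: a zero byte anywhere in
         x2..x5 makes the byte-wise minimum zero, so a single pcmpeqb
         + pmovmskb scans 64 source bytes for the terminator.  */
      __m128i min = _mm_min_epu8 (_mm_min_epu8 (x2, x3),
                                  _mm_min_epu8 (x4, x5));
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (min,
                                                    _mm_setzero_si128 ()));
      if (mask != 0)
        return mask;

      /* palignr re-splices each pair of aligned blocks back into the
         original byte stream, so the four stores stay aligned.  */
      _mm_store_si128 ((__m128i *) (dst +  0), _mm_alignr_epi8 (x2, x1, 4));
      _mm_store_si128 ((__m128i *) (dst + 16), _mm_alignr_epi8 (x3, x2, 4));
      _mm_store_si128 ((__m128i *) (dst + 32), _mm_alignr_epi8 (x4, x3, 4));
      _mm_store_si128 ((__m128i *) (dst + 48), _mm_alignr_epi8 (x5, x4, 4));
      return 0;
    }

(Build with -mssse3.  Block x1 was already scanned for a NUL by the
previous iteration, which is why the minimum only covers x2..x5.)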
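The L(CopyFrom1To16Bytes) dispatch deleted above turns the pmovmskb
result into the NUL offset by testing the mask bits one at a time
(test $0x01, %al ... test $0x40, %ah), and each L(ExitN) then copies its
N bytes as at most two possibly overlapping word moves instead of a byte
loop (e.g. L(Exit15) does an 8-byte move at offset 0 and another at
offset 7). A hedged C equivalent, with a hypothetical name and
__builtin_ctz standing in for the open-coded bit scan:

    #include <string.h>

    /* Copy the terminated head of a 16-byte block.  'mask' is the
       pmovmskb result; its lowest set bit is the NUL's offset.
       Assumes mask != 0.  */
    static void
    copy_terminated_block (char *dst, const char *src, unsigned int mask)
    {
      unsigned int n = __builtin_ctz (mask) + 1;  /* bytes incl. NUL */
      if (n >= 8)
        {
          memcpy (dst, src, 8);
          memcpy (dst + n - 8, src + n - 8, 8);  /* overlaps if n < 16 */
        }
      else if (n >= 4)
        {
          memcpy (dst, src, 4);
          memcpy (dst + n - 4, src + n - 4, 4);
        }
      else
        {
          dst[0] = src[0];          /* n is 1..3 */
          dst[n - 1] = src[n - 1];
          if (n == 3)
            dst[1] = src[1];
        }
    }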
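The recurring USE_AS_STPCPY tail in those exits,

    cmpb $1, (%rax)
    sbb $-1, %rax

is a branchless "advance past the last byte unless it is the NUL": the
compare sets the carry flag exactly when the byte is 0 (unsigned below 1),
and sbb computes rax - (-1) - CF = rax + 1 - CF. With %rax pointing at
the last byte stored, this appears to implement stpncpy's documented
return value: the address of the terminating NUL, or dest + n when the
copy was truncated. A one-line C equivalent, with p playing the role of
%rax:

    p += (*p != 0);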
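Finally, L(StrncpyFillTailWithZero1) above implements strncpy's
requirement to pad the rest of the buffer with NULs: it zeroes 16 bytes
unaligned, realigns to a 16-byte boundary (re-zeroing any overlap is
harmless), then streams four movdqa stores per iteration. A loose C
sketch under the assumption n > 16; the name, and the plain loops
standing in for the asm's branchy tail dispatch, are illustrative only:

    #include <emmintrin.h>  /* SSE2 */
    #include <stdint.h>
    #include <stddef.h>

    static void
    zero_tail (unsigned char *p, size_t n)
    {
      const __m128i z = _mm_setzero_si128 ();
      _mm_storeu_si128 ((__m128i *) p, z);      /* unaligned head */
      size_t skew = 16 - ((uintptr_t) p & 15);  /* 1..16, covered above */
      p += skew;                                /* now 16-byte aligned */
      n -= skew;
      while (n >= 64)                           /* the movdqa x4 loop */
        {
          _mm_store_si128 ((__m128i *) (p +  0), z);
          _mm_store_si128 ((__m128i *) (p + 16), z);
          _mm_store_si128 ((__m128i *) (p + 32), z);
          _mm_store_si128 ((__m128i *) (p + 48), z);
          p += 64;
          n -= 64;
        }
      while (n >= 16)
        {
          _mm_store_si128 ((__m128i *) p, z);
          p += 16;
          n -= 16;
        }
      while (n > 0)
        {
          *p++ = 0;
          n--;
        }
    }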
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein ` (4 preceding siblings ...) 2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein @ 2022-03-25 19:54 ` H.J. Lu 2022-03-25 20:34 ` Andreas Schwab 6 siblings, 0 replies; 49+ messages in thread From: H.J. Lu @ 2022-03-25 19:54 UTC (permalink / raw) To: Noah Goldstein; +Cc: libc-alpha, carlos On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote: > > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer > SSSE3. As a result its no longer with the code size cost. > --- > sysdeps/x86_64/multiarch/Makefile | 2 - > sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - > sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - > sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 -------------------- > sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 - > 5 files changed, 2006 deletions(-) > delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S > delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > > diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile > index 6507d1b7fa..51222dfab1 100644 > --- a/sysdeps/x86_64/multiarch/Makefile > +++ b/sysdeps/x86_64/multiarch/Makefile > @@ -12,7 +12,6 @@ sysdep_routines += \ > memcmp-evex-movbe \ > memcmp-sse2 \ > memcmp-sse4 \ > - memcmp-ssse3 \ > memcmpeq-avx2 \ > memcmpeq-avx2-rtm \ > memcmpeq-evex \ > @@ -179,7 +178,6 @@ sysdep_routines += \ > wmemcmp-c \ > wmemcmp-evex-movbe \ > wmemcmp-sse4 \ > - wmemcmp-ssse3 \ > # sysdep_routines > endif > > diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > index 40cc6cc49e..f389928a4e 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c > +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c > @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __memcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), > __memcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), > - __memcmp_ssse3) > IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) > > #ifdef SHARED > @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, > __wmemcmp_evex_movbe) > IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), > __wmemcmp_sse4_1) > - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), > - __wmemcmp_ssse3) > IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) > > /* Support sysdeps/x86_64/multiarch/wmemset.c. 
*/ > diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > index cd12613699..44759a3ad5 100644 > --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h > +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h > @@ -20,7 +20,6 @@ > # include <init-arch.h> > > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; > -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; > extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; > @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void) > if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) > return OPTIMIZE (sse4_1); > > - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) > - return OPTIMIZE (ssse3); > - > return OPTIMIZE (sse2); > } > diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S > deleted file mode 100644 > index df1b1fc494..0000000000 > --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S > +++ /dev/null > @@ -1,1992 +0,0 @@ > -/* memcmp with SSSE3, wmemcmp with SSSE3 > - Copyright (C) 2011-2022 Free Software Foundation, Inc. > - This file is part of the GNU C Library. > - > - The GNU C Library is free software; you can redistribute it and/or > - modify it under the terms of the GNU Lesser General Public > - License as published by the Free Software Foundation; either > - version 2.1 of the License, or (at your option) any later version. > - > - The GNU C Library is distributed in the hope that it will be useful, > - but WITHOUT ANY WARRANTY; without even the implied warranty of > - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU > - Lesser General Public License for more details. > - > - You should have received a copy of the GNU Lesser General Public > - License along with the GNU C Library; if not, see > - <https://www.gnu.org/licenses/>. */ > - > -#if IS_IN (libc) > - > -# include <sysdep.h> > - > -# ifndef MEMCMP > -# define MEMCMP __memcmp_ssse3 > -# endif > - > -/* Warning! > - wmemcmp has to use SIGNED comparison for elements. > - memcmp has to use UNSIGNED comparison for elemnts. > -*/ > - > - atom_text_section > -ENTRY (MEMCMP) > -# ifdef USE_AS_WMEMCMP > - shl $2, %RDX_LP > - test %RDX_LP, %RDX_LP > - jz L(equal) > -# elif defined __ILP32__ > - /* Clear the upper 32 bits. */ > - mov %edx, %edx > -# endif > - mov %rdx, %rcx > - mov %rdi, %rdx > - cmp $48, %rcx; > - jae L(48bytesormore) /* LEN => 48 */ > - > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -/* ECX >= 32. 
*/ > -L(48bytesormore): > - movdqu (%rdi), %xmm3 > - movdqu (%rsi), %xmm0 > - pcmpeqb %xmm0, %xmm3 > - pmovmskb %xmm3, %edx > - lea 16(%rdi), %rdi > - lea 16(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(less16bytes) > - mov %edi, %edx > - and $0xf, %edx > - xor %rdx, %rdi > - sub %rdx, %rsi > - add %rdx, %rcx > - mov %esi, %edx > - and $0xf, %edx > - jz L(shr_0) > - xor %rdx, %rsi > - > -# ifndef USE_AS_WMEMCMP > - cmp $8, %edx > - jae L(next_unaligned_table) > - cmp $0, %edx > - je L(shr_0) > - cmp $1, %edx > - je L(shr_1) > - cmp $2, %edx > - je L(shr_2) > - cmp $3, %edx > - je L(shr_3) > - cmp $4, %edx > - je L(shr_4) > - cmp $5, %edx > - je L(shr_5) > - cmp $6, %edx > - je L(shr_6) > - jmp L(shr_7) > - > - .p2align 2 > -L(next_unaligned_table): > - cmp $8, %edx > - je L(shr_8) > - cmp $9, %edx > - je L(shr_9) > - cmp $10, %edx > - je L(shr_10) > - cmp $11, %edx > - je L(shr_11) > - cmp $12, %edx > - je L(shr_12) > - cmp $13, %edx > - je L(shr_13) > - cmp $14, %edx > - je L(shr_14) > - jmp L(shr_15) > -# else > - cmp $0, %edx > - je L(shr_0) > - cmp $4, %edx > - je L(shr_4) > - cmp $8, %edx > - je L(shr_8) > - jmp L(shr_12) > -# endif > - > - .p2align 4 > -L(shr_0): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - jae L(shr_0_gobble) > - xor %eax, %eax > - movdqa (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > - pand %xmm1, %xmm2 > - pmovmskb %xmm2, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_0_gobble): > - movdqa (%rsi), %xmm0 > - xor %eax, %eax > - pcmpeqb (%rdi), %xmm0 > - sub $32, %rcx > - movdqa 16(%rsi), %xmm2 > - pcmpeqb 16(%rdi), %xmm2 > -L(shr_0_gobble_loop): > - pand %xmm0, %xmm2 > - sub $32, %rcx > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - movdqa 32(%rsi), %xmm0 > - movdqa 48(%rsi), %xmm2 > - sbb $0xffff, %edx > - pcmpeqb 32(%rdi), %xmm0 > - pcmpeqb 48(%rdi), %xmm2 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - jz L(shr_0_gobble_loop) > - > - pand %xmm0, %xmm2 > - cmp $0, %rcx > - jge L(next) > - inc %edx > - add $32, %rcx > -L(next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm2, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_1): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_1_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $1, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $1, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_1_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $1, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $1, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_1_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $1, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $1, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 
32(%rdi), %rdi > - jz L(shr_1_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_1_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_1_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 1(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - > - .p2align 4 > -L(shr_2): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_2_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $2, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $2, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_2_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $2, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $2, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_2_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $2, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $2, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_2_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_2_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_2_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 2(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_3_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $3, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $3, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_3_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $3, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $3, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_3_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $3, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $3, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_3_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_3_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_3_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 3(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 
4 > -L(shr_4): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_4_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $4, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $4, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_4_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $4, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $4, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_4_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $4, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $4, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_4_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_4_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_4_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 4(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_5): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_5_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $5, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $5, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_5_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $5, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $5, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_5_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $5, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $5, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_5_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_5_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_5_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 5(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_6_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $6, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > 
- add $6, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_6_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $6, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $6, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_6_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $6, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $6, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_6_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_6_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_6_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 6(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_7_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $7, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $7, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_7_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $7, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $7, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_7_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $7, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $7, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_7_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_7_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_7_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 7(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_8): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_8_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $8, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $8, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_8_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $8, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $8, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_8_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), 
%xmm3 > - palignr $8, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $8, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_8_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_8_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_8_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 8(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_9): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_9_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $9, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $9, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_9_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $9, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $9, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_9_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $9, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $9, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_9_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_9_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_9_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 9(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_10_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $10, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $10, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_10_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $10, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $10, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_10_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $10, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $10, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_10_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_10_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_10_gobble_next): > - test %edx, %edx > - jnz L(exit) > - 
> - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 10(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_11_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $11, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $11, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_11_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $11, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $11, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_11_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $11, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $11, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_11_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_11_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_11_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 11(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# endif > - > - .p2align 4 > -L(shr_12): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_12_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $12, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $12, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_12_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $12, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $12, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_12_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $12, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $12, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_12_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_12_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_12_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 12(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > -# ifndef USE_AS_WMEMCMP > - > - .p2align 4 > -L(shr_13): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_13_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, 
%xmm2 > - palignr $13, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $13, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_13_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $13, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $13, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_13_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $13, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $13, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_13_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_13_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_13_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 13(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_14_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $14, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $14, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_14_gobble): > - sub $32, %rcx > - movdqa 16(%rsi), %xmm0 > - palignr $14, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $14, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_14_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $14, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $14, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_14_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_14_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_14_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 14(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15): > - cmp $80, %rcx > - lea -48(%rcx), %rcx > - mov %edx, %eax > - jae L(shr_15_gobble) > - > - movdqa 16(%rsi), %xmm1 > - movdqa %xmm1, %xmm2 > - palignr $15, (%rsi), %xmm1 > - pcmpeqb (%rdi), %xmm1 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, %xmm2, %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > - pand %xmm1, %xmm3 > - pmovmskb %xmm3, %edx > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - add $15, %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > - > - .p2align 4 > -L(shr_15_gobble): > - sub $32, %rcx > - movdqa 
16(%rsi), %xmm0 > - palignr $15, (%rsi), %xmm0 > - pcmpeqb (%rdi), %xmm0 > - > - movdqa 32(%rsi), %xmm3 > - palignr $15, 16(%rsi), %xmm3 > - pcmpeqb 16(%rdi), %xmm3 > - > -L(shr_15_gobble_loop): > - pand %xmm0, %xmm3 > - sub $32, %rcx > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - > - movdqa 64(%rsi), %xmm3 > - palignr $15, 48(%rsi), %xmm3 > - sbb $0xffff, %edx > - movdqa 48(%rsi), %xmm0 > - palignr $15, 32(%rsi), %xmm0 > - pcmpeqb 32(%rdi), %xmm0 > - lea 32(%rsi), %rsi > - pcmpeqb 48(%rdi), %xmm3 > - > - lea 32(%rdi), %rdi > - jz L(shr_15_gobble_loop) > - pand %xmm0, %xmm3 > - > - cmp $0, %rcx > - jge L(shr_15_gobble_next) > - inc %edx > - add $32, %rcx > -L(shr_15_gobble_next): > - test %edx, %edx > - jnz L(exit) > - > - pmovmskb %xmm3, %edx > - movdqa %xmm0, %xmm1 > - lea 32(%rdi), %rdi > - lea 32(%rsi), %rsi > - sub $0xffff, %edx > - jnz L(exit) > - > - lea 15(%rsi), %rsi > - add %rcx, %rsi > - add %rcx, %rdi > - jmp L(less48bytes) > -# endif > - .p2align 4 > -L(exit): > - pmovmskb %xmm1, %r8d > - sub $0xffff, %r8d > - jz L(first16bytes) > - lea -16(%rsi), %rsi > - lea -16(%rdi), %rdi > - mov %r8d, %edx > -L(first16bytes): > - add %rax, %rsi > -L(less16bytes): > -# ifndef USE_AS_WMEMCMP > - test %dl, %dl > - jz L(next_24_bytes) > - > - test $0x01, %dl > - jnz L(Byte16) > - > - test $0x02, %dl > - jnz L(Byte17) > - > - test $0x04, %dl > - jnz L(Byte18) > - > - test $0x08, %dl > - jnz L(Byte19) > - > - test $0x10, %dl > - jnz L(Byte20) > - > - test $0x20, %dl > - jnz L(Byte21) > - > - test $0x40, %dl > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte16): > - movzbl -16(%rdi), %eax > - movzbl -16(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte17): > - movzbl -15(%rdi), %eax > - movzbl -15(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte18): > - movzbl -14(%rdi), %eax > - movzbl -14(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte19): > - movzbl -13(%rdi), %eax > - movzbl -13(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte20): > - movzbl -12(%rdi), %eax > - movzbl -12(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte21): > - movzbl -11(%rdi), %eax > - movzbl -11(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(Byte22): > - movzbl -10(%rdi), %eax > - movzbl -10(%rsi), %edx > - sub %edx, %eax > - ret > - > - .p2align 4 > -L(next_24_bytes): > - lea 8(%rdi), %rdi > - lea 8(%rsi), %rsi > - test $0x01, %dh > - jnz L(Byte16) > - > - test $0x02, %dh > - jnz L(Byte17) > - > - test $0x04, %dh > - jnz L(Byte18) > - > - test $0x08, %dh > - jnz L(Byte19) > - > - test $0x10, %dh > - jnz L(Byte20) > - > - test $0x20, %dh > - jnz L(Byte21) > - > - test $0x40, %dh > - jnz L(Byte22) > - > - movzbl -9(%rdi), %eax > - movzbl -9(%rsi), %edx > - sub %edx, %eax > - ret > -# else > -/* special for wmemcmp */ > - xor %eax, %eax > - test %dl, %dl > - jz L(next_two_double_words) > - and $15, %dl > - jz L(second_double_word) > - mov -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(second_double_word): > - mov -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(next_two_double_words): > - and $15, %dh > - jz L(fourth_double_word) > - mov -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > - ret > - > - .p2align 4 > -L(fourth_double_word): > - mov -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > - ret > -# endif > - > - 
.p2align 4 > -L(less48bytes): > - cmp $8, %ecx > - jae L(more8bytes) > - cmp $0, %ecx > - je L(0bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $1, %ecx > - je L(1bytes) > - cmp $2, %ecx > - je L(2bytes) > - cmp $3, %ecx > - je L(3bytes) > - cmp $4, %ecx > - je L(4bytes) > - cmp $5, %ecx > - je L(5bytes) > - cmp $6, %ecx > - je L(6bytes) > - jmp L(7bytes) > -# else > - jmp L(4bytes) > -# endif > - > - .p2align 4 > -L(more8bytes): > - cmp $16, %ecx > - jae L(more16bytes) > - cmp $8, %ecx > - je L(8bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $9, %ecx > - je L(9bytes) > - cmp $10, %ecx > - je L(10bytes) > - cmp $11, %ecx > - je L(11bytes) > - cmp $12, %ecx > - je L(12bytes) > - cmp $13, %ecx > - je L(13bytes) > - cmp $14, %ecx > - je L(14bytes) > - jmp L(15bytes) > -# else > - jmp L(12bytes) > -# endif > - > - .p2align 4 > -L(more16bytes): > - cmp $24, %ecx > - jae L(more24bytes) > - cmp $16, %ecx > - je L(16bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $17, %ecx > - je L(17bytes) > - cmp $18, %ecx > - je L(18bytes) > - cmp $19, %ecx > - je L(19bytes) > - cmp $20, %ecx > - je L(20bytes) > - cmp $21, %ecx > - je L(21bytes) > - cmp $22, %ecx > - je L(22bytes) > - jmp L(23bytes) > -# else > - jmp L(20bytes) > -# endif > - > - .p2align 4 > -L(more24bytes): > - cmp $32, %ecx > - jae L(more32bytes) > - cmp $24, %ecx > - je L(24bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $25, %ecx > - je L(25bytes) > - cmp $26, %ecx > - je L(26bytes) > - cmp $27, %ecx > - je L(27bytes) > - cmp $28, %ecx > - je L(28bytes) > - cmp $29, %ecx > - je L(29bytes) > - cmp $30, %ecx > - je L(30bytes) > - jmp L(31bytes) > -# else > - jmp L(28bytes) > -# endif > - > - .p2align 4 > -L(more32bytes): > - cmp $40, %ecx > - jae L(more40bytes) > - cmp $32, %ecx > - je L(32bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $33, %ecx > - je L(33bytes) > - cmp $34, %ecx > - je L(34bytes) > - cmp $35, %ecx > - je L(35bytes) > - cmp $36, %ecx > - je L(36bytes) > - cmp $37, %ecx > - je L(37bytes) > - cmp $38, %ecx > - je L(38bytes) > - jmp L(39bytes) > -# else > - jmp L(36bytes) > -# endif > - > - .p2align 4 > -L(more40bytes): > - cmp $40, %ecx > - je L(40bytes) > -# ifndef USE_AS_WMEMCMP > - cmp $41, %ecx > - je L(41bytes) > - cmp $42, %ecx > - je L(42bytes) > - cmp $43, %ecx > - je L(43bytes) > - cmp $44, %ecx > - je L(44bytes) > - cmp $45, %ecx > - je L(45bytes) > - cmp $46, %ecx > - je L(46bytes) > - jmp L(47bytes) > - > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - movl -44(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - movl -40(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - movl -36(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - movl -32(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - movl -28(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - movl -24(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - movl -20(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - movl -16(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - movl -12(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - movl -8(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - movl -4(%rsi), %ecx > - cmp %ecx, %eax > - jne 
L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# else > - .p2align 4 > -L(44bytes): > - movl -44(%rdi), %eax > - cmp -44(%rsi), %eax > - jne L(find_diff) > -L(40bytes): > - movl -40(%rdi), %eax > - cmp -40(%rsi), %eax > - jne L(find_diff) > -L(36bytes): > - movl -36(%rdi), %eax > - cmp -36(%rsi), %eax > - jne L(find_diff) > -L(32bytes): > - movl -32(%rdi), %eax > - cmp -32(%rsi), %eax > - jne L(find_diff) > -L(28bytes): > - movl -28(%rdi), %eax > - cmp -28(%rsi), %eax > - jne L(find_diff) > -L(24bytes): > - movl -24(%rdi), %eax > - cmp -24(%rsi), %eax > - jne L(find_diff) > -L(20bytes): > - movl -20(%rdi), %eax > - cmp -20(%rsi), %eax > - jne L(find_diff) > -L(16bytes): > - movl -16(%rdi), %eax > - cmp -16(%rsi), %eax > - jne L(find_diff) > -L(12bytes): > - movl -12(%rdi), %eax > - cmp -12(%rsi), %eax > - jne L(find_diff) > -L(8bytes): > - movl -8(%rdi), %eax > - cmp -8(%rsi), %eax > - jne L(find_diff) > -L(4bytes): > - movl -4(%rdi), %eax > - cmp -4(%rsi), %eax > - jne L(find_diff) > -L(0bytes): > - xor %eax, %eax > - ret > -# endif > - > -# ifndef USE_AS_WMEMCMP > - .p2align 4 > -L(45bytes): > - movl -45(%rdi), %eax > - movl -45(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(41bytes): > - movl -41(%rdi), %eax > - movl -41(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(37bytes): > - movl -37(%rdi), %eax > - movl -37(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(33bytes): > - movl -33(%rdi), %eax > - movl -33(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(29bytes): > - movl -29(%rdi), %eax > - movl -29(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(25bytes): > - movl -25(%rdi), %eax > - movl -25(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(21bytes): > - movl -21(%rdi), %eax > - movl -21(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(17bytes): > - movl -17(%rdi), %eax > - movl -17(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(13bytes): > - movl -13(%rdi), %eax > - movl -13(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(9bytes): > - movl -9(%rdi), %eax > - movl -9(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(5bytes): > - movl -5(%rdi), %eax > - movl -5(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(1bytes): > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(46bytes): > - movl -46(%rdi), %eax > - movl -46(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(42bytes): > - movl -42(%rdi), %eax > - movl -42(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(38bytes): > - movl -38(%rdi), %eax > - movl -38(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(34bytes): > - movl -34(%rdi), %eax > - movl -34(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(30bytes): > - movl -30(%rdi), %eax > - movl -30(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(26bytes): > - movl -26(%rdi), %eax > - movl -26(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(22bytes): > - movl -22(%rdi), %eax > - movl -22(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(18bytes): > - movl -18(%rdi), %eax > - movl -18(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(14bytes): > - movl -14(%rdi), %eax > - movl -14(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(10bytes): > - movl -10(%rdi), %eax > - movl -10(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(6bytes): > - movl -6(%rdi), %eax > - movl -6(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(2bytes): > - 
movzwl -2(%rdi), %eax > - movzwl -2(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(47bytes): > - movl -47(%rdi), %eax > - movl -47(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(43bytes): > - movl -43(%rdi), %eax > - movl -43(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(39bytes): > - movl -39(%rdi), %eax > - movl -39(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(35bytes): > - movl -35(%rdi), %eax > - movl -35(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(31bytes): > - movl -31(%rdi), %eax > - movl -31(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(27bytes): > - movl -27(%rdi), %eax > - movl -27(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(23bytes): > - movl -23(%rdi), %eax > - movl -23(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(19bytes): > - movl -19(%rdi), %eax > - movl -19(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(15bytes): > - movl -15(%rdi), %eax > - movl -15(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(11bytes): > - movl -11(%rdi), %eax > - movl -11(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(7bytes): > - movl -7(%rdi), %eax > - movl -7(%rsi), %ecx > - cmp %ecx, %eax > - jne L(find_diff) > -L(3bytes): > - movzwl -3(%rdi), %eax > - movzwl -3(%rsi), %ecx > - cmpb %cl, %al > - jne L(set) > - cmp %ecx, %eax > - jne L(set) > - movzbl -1(%rdi), %eax > - cmpb -1(%rsi), %al > - jne L(set) > - xor %eax, %eax > - ret > - > - .p2align 4 > -L(find_diff): > - cmpb %cl, %al > - jne L(set) > - cmpw %cx, %ax > - jne L(set) > - shr $16, %eax > - shr $16, %ecx > - cmpb %cl, %al > - jne L(set) > - > -/* We get there only if we already know there is a > -difference. */ > - > - cmp %ecx, %eax > -L(set): > - sbb %eax, %eax > - sbb $-1, %eax > - ret > -# else > - > -/* for wmemcmp */ > - .p2align 4 > -L(find_diff): > - mov $1, %eax > - jg L(find_diff_bigger) > - neg %eax > - ret > - > - .p2align 4 > -L(find_diff_bigger): > - ret > -# endif > - > - .p2align 4 > -L(equal): > - xor %eax, %eax > - ret > - > -END (MEMCMP) > -#endif > diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > deleted file mode 100644 > index a41ef95fc1..0000000000 > --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S > +++ /dev/null > @@ -1,4 +0,0 @@ > -#define USE_AS_WMEMCMP 1 > -#define MEMCMP __wmemcmp_ssse3 > - > -#include "memcmp-ssse3.S" > -- > 2.25.1 > LGTM. Reviewed-by: H.J. Lu <hjl.tools@gmail.com> Thanks. -- H.J. ^ permalink raw reply [flat|nested] 49+ messages in thread
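(Aside for readers skimming the deleted file quoted above: almost all of
its 1992 lines come from one technique.  After aligning %rdi to 16 bytes,
the code dispatches on the remaining misalignment of %rsi to one of the
unrolled L(shr_N) blocks, because palignr takes its shift count as an
immediate -- every possible byte offset needs its own copy of the loop,
all of 1..15 for memcmp but only the multiples of 4 for wmemcmp.  Each
block then compares 16-byte chunks with pcmpeqb and packs the result with
pmovmskb.  Below is a minimal C sketch of that per-chunk compare step
using SSE2 intrinsics; chunk_diff is an illustrative name, not a glibc
symbol, and it uses unaligned loads for simplicity where the deleted code
does aligned loads recombined via palignr:

    #include <emmintrin.h>   /* SSE2 intrinsics */

    /* Compare one 16-byte chunk.  pcmpeqb yields 0xff per equal byte and
       pmovmskb packs the high bits into a 16-bit mask, so a clear bit
       marks a mismatching byte.  Illustrative sketch, not glibc code.  */
    static int
    chunk_diff (const unsigned char *s1, const unsigned char *s2)
    {
      __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
      __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
      unsigned int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b));
      if (mask == 0xffff)
        return 0;                                   /* chunk is equal */
      unsigned int i = __builtin_ctz (mask ^ 0xffff);  /* first mismatch */
      return s1[i] - s2[i];   /* unsigned byte difference, as memcmp wants */
    }

Since the SSE4.1, AVX2, and EVEX implementations cover the targets that
matter, the selector in ifunc-memcmp.h now falls straight through from the
SSE4.1 check to the SSE2 baseline, which is the point of the removal.)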
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
  2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
                   ` (5 preceding siblings ...)
  2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
@ 2022-03-25 20:34 ` Andreas Schwab
  2022-03-25 20:40   ` Noah Goldstein
  6 siblings, 1 reply; 49+ messages in thread

From: Andreas Schwab @ 2022-03-25 20:34 UTC (permalink / raw)
To: Noah Goldstein via Libc-alpha

On Mar 25 2022, Noah Goldstein via Libc-alpha wrote:

> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result its no longer with the code size cost.

I think the second sentence is missing something. Also: s/its/it is/.

-- 
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."

^ permalink raw reply	[flat|nested] 49+ messages in thread
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
  2022-03-25 20:34 ` Andreas Schwab
@ 2022-03-25 20:40   ` Noah Goldstein
  0 siblings, 0 replies; 49+ messages in thread

From: Noah Goldstein @ 2022-03-25 20:40 UTC (permalink / raw)
To: Andreas Schwab; +Cc: Noah Goldstein via Libc-alpha

On Fri, Mar 25, 2022 at 3:34 PM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> On Mar 25 2022, Noah Goldstein via Libc-alpha wrote:
>
> > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> > SSSE3. As a result its no longer with the code size cost.
>
> I think the second sentence is missing something. Also: s/its/it is/.
^ How's: "As a result it is no longer worth it to keep the SSSE3
versions given the code size cost."
>
> --
> Andreas Schwab, schwab@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
> "And now for something completely different."

^ permalink raw reply	[flat|nested] 49+ messages in thread
end of thread, other threads:[~2022-04-14 18:13 UTC | newest]

Thread overview: 49+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-04-10  0:57 ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 1/6] " Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-10  0:48 ` Noah Goldstein
2022-04-10  0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10  0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-10  0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-10  0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10  0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-14 18:05 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-14 18:06 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-14 18:10 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 18:13 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
2022-03-25 20:40 ` Noah Goldstein