* [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
@ 2022-03-25 18:36 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (6 more replies)
0 siblings, 7 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, the SSSE3 version is no longer worth its
code size cost.
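
For illustration, the dispatch order that remains after this patch can
be sketched outside glibc with GCC's ifunc attribute and
__builtin_cpu_supports. This is a simplified stand-in, not glibc's
internal selector (glibc checks CPU_FEATURE_USABLE_P on its own
cpu_features structure, and the real chain also has EVEX and RTM
rungs); the memcmp_* names below are hypothetical:

#include <stddef.h>

/* Hypothetical stand-ins for __memcmp_sse2, __memcmp_sse4_1 and
   __memcmp_avx2_movbe.  */
int memcmp_sse2 (const void *, const void *, size_t);
int memcmp_sse4_1 (const void *, const void *, size_t);
int memcmp_avx2 (const void *, const void *, size_t);

/* Resolver mirroring ifunc-memcmp.h after this patch: AVX2 if
   usable, then SSE4.1, then the SSE2 baseline.  The SSSE3 rung
   between SSE4.1 and SSE2 is gone.  */
static void *
resolve_memcmp (void)
{
  /* GCC requires this before __builtin_cpu_supports in a resolver,
     which may run before normal initializers.  */
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx2"))
    return (void *) memcmp_avx2;
  if (__builtin_cpu_supports ("sse4.1"))
    return (void *) memcmp_sse4_1;
  return (void *) memcmp_sse2;
}

int my_memcmp (const void *, const void *, size_t)
  __attribute__ ((ifunc ("resolve_memcmp")));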
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
5 files changed, 2006 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
memcmp-evex-movbe \
memcmp-sse2 \
memcmp-sse4 \
- memcmp-ssse3 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
wmemcmp-c \
wmemcmp-evex-movbe \
wmemcmp-sse4 \
- wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
- __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
- __wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
- test %RDX_LP, %RDX_LP
- jz L(equal)
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx;
- jae L(48bytesormore) /* LEN => 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* ECX >= 32. */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get there only if we already know there is a
-difference. */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
--
2.25.1
* [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-03-25 18:36 ` Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
` (9 more replies)
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
` (5 subsequent siblings)
6 siblings, 10 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, the SSSE3 versions are no longer worth their
code size cost.
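
The only thing the SSSE3 build of strcmp.S bought was collapsing the
three-instruction SSE2 merge (psrldq/pslldq/por) into a single
palignr, as the hunks below show. A minimal intrinsics sketch of that
equivalence (an illustrative example, not glibc code; shift fixed at
1 byte since palignr takes an immediate; compile with -mssse3):

#include <emmintrin.h>  /* SSE2: _mm_srli_si128, _mm_slli_si128, _mm_or_si128 */
#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 */

/* Both functions return the same 16-byte window of the 32-byte
   value hi:lo, namely lo[1..15] followed by hi[0].  */

static __m128i
merge_sse2 (__m128i hi, __m128i lo)
{
  /* psrldq $1, lo ; pslldq $15, hi ; por lo, hi  */
  return _mm_or_si128 (_mm_srli_si128 (lo, 1),
                       _mm_slli_si128 (hi, 15));
}

static __m128i
merge_ssse3 (__m128i hi, __m128i lo)
{
  /* palignr $1, lo, hi  */
  return _mm_alignr_epi8 (hi, lo, 1);
}

Saving two instructions per 16-byte block is the whole difference,
which is why dropping the separate SSSE3 copy costs so little.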
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
sysdeps/x86_64/multiarch/strcmp.c | 4 -
sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
sysdeps/x86_64/multiarch/strncmp.c | 4 -
sysdeps/x86_64/strcmp.S | 155 ++++--------------
10 files changed, 30 insertions(+), 202 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 51222dfab1..ed2def288d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -58,7 +58,6 @@ sysdep_routines += \
strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
- strcasecmp_l-ssse3 \
strcat-avx2 \
strcat-avx2-rtm \
strcat-evex \
@@ -80,7 +79,6 @@ sysdep_routines += \
strcmp-sse2 \
strcmp-sse2-unaligned \
strcmp-sse4_2 \
- strcmp-ssse3 \
strcpy-avx2 \
strcpy-avx2-rtm \
strcpy-evex \
@@ -98,7 +96,6 @@ sysdep_routines += \
strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
- strncase_l-ssse3 \
strncat-avx2 \
strncat-avx2-rtm \
strncat-c \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncmp-evex \
strncmp-sse2 \
strncmp-sse4_2 \
- strncmp-ssse3 \
strncpy-avx2 \
strncpy-avx2-rtm \
strncpy-c \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f389928a4e..7e2be3554b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
@@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
__strcasecmp_l_sse2))
@@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strcmp_evex)
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
__strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
- __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
@@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
__strncasecmp_sse2))
@@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
__strncasecmp_l_sse2))
@@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncmp_evex)
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
__strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
- __strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 766539c241..296d32071b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -20,7 +20,6 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baad..a248c2a6e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -28,7 +28,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index ec37308347..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcmp optimized with SSSE3.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define STRCMP __strncmp_ssse3
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strcmp)
-
-#define USE_SSSE3 1
-#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index fca74199d8..70ae6547c9 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,7 +27,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 99d8b36f1d..c38dc627f9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -59,12 +59,7 @@
# endif
#endif
-#ifndef USE_SSSE3
.text
-#else
- .section .text.ssse3,"ax",@progbits
-#endif
-
#ifdef USE_AS_STRCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
@@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
--
2.25.1
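
The hunks above delete the SSSE3 palignr branch and keep the SSE2
psrldq/pslldq/por sequence unconditionally; both compute the same 16-byte
merge of two neighboring vectors. A hypothetical standalone intrinsics
sketch of that equivalence (not part of the patch; compile with -mssse3
so the reference _mm_alignr_epi8 is available):

#include <emmintrin.h>   /* SSE2: _mm_srli_si128, _mm_slli_si128, _mm_or_si128 */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8, used here only as a reference */
#include <stdio.h>
#include <string.h>

/* SSE2 equivalent of "palignr $N, lo, hi": shift each half into place
   and OR.  N must be a compile-time constant, hence a macro.  */
#define ALIGNR_SSE2(hi, lo, N) \
  _mm_or_si128 (_mm_srli_si128 ((lo), (N)), _mm_slli_si128 ((hi), 16 - (N)))

int
main (void)
{
  unsigned char buf[32], a[16], b[16];
  for (int i = 0; i < 32; i++)
    buf[i] = (unsigned char) i;
  __m128i lo = _mm_loadu_si128 ((const __m128i *) buf);
  __m128i hi = _mm_loadu_si128 ((const __m128i *) (buf + 16));
  _mm_storeu_si128 ((__m128i *) a, ALIGNR_SSE2 (hi, lo, 8));
  _mm_storeu_si128 ((__m128i *) b, _mm_alignr_epi8 (hi, lo, 8));
  /* Prints 1: the two sequences produce identical results.  */
  printf ("sse2 matches ssse3: %d\n", memcmp (a, b, 16) == 0);
  return 0;
}

The two-shift-plus-OR form costs one extra instruction per merge, but it
needs only SSE2, which is presumably acceptable once the SSSE3 variant
itself is gone.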
* [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-03-25 18:36 ` Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
` (4 subsequent siblings)
6 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result, it's no longer worth the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +-
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 --------------------
sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 -
5 files changed, 7 insertions(+), 3183 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ed2def288d..48f81711ae 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
@@ -24,7 +23,6 @@ sysdep_routines += \
memmove-avx512-unaligned-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
- memmove-ssse3 \
memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e2be3554b..70b0e9c62e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned)
@@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
@@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..1ecdd4b0d3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +92,15 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
+ return OPTIMIZE (ssse3_back);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
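
Outside the glibc macros, the reordered selector tail is easier to follow.
A self-contained model (hypothetical: plain bools stand in for
CPU_FEATURE_USABLE_P / CPU_FEATURES_ARCH_P, strings for the OPTIMIZE
targets):

#include <stdbool.h>
#include <stdio.h>

/* Models the tail of IFUNC_SELECTOR in ifunc-memmove.h after this patch.  */
static const char *
select_memmove (bool ssse3, bool fast_unaligned_copy,
                bool fast_copy_backward, bool erms)
{
  if (ssse3 && !fast_unaligned_copy)
    {
      if (fast_copy_backward)
        return "__memmove_ssse3_back";
      /* __memmove_ssse3 no longer exists; fall through to SSE2.  */
    }
  if (erms)
    return "__memmove_sse2_unaligned_erms";
  return "__memmove_sse2_unaligned";
}

int
main (void)
{
  /* An SSSE3 machine without Fast_Copy_Backward previously got
     __memmove_ssse3; it now falls through to the SSE2 versions.  */
  printf ("%s\n", select_memmove (true, false, false, true));
  return 0;
}

That fall-through is the only behavioral change: every other feature
combination selects the same implementation it did before the patch.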
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
--
2.25.1
* [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
@ 2022-03-25 18:36 ` Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
` (3 subsequent siblings)
6 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth the code size cost of
keeping the SSSE3 versions.
---
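A note on the resulting dispatch (below the cut line, so not part of the
commit itself): once the SSSE3 branch is deleted from ifunc-memmove.h, the
selector falls straight through to the ERMS/SSE2 baseline. The sketch below
is illustrative only -- the cpu_has_* flags and pick_memmove() are
hypothetical stand-ins for glibc's CPU_FEATURE_USABLE_P and OPTIMIZE
machinery, and the upper AVX512/AVX tiers are abbreviated:

    #include <stdio.h>

    /* Hypothetical stand-ins for glibc's CPU_FEATURE_USABLE_P checks.  */
    static int has_avx512vl, has_avx2, has_erms;

    /* Returns the name of the variant the selector would now pick; the
       real IFUNC_SELECTOR returns a function pointer via OPTIMIZE.  */
    static const char *
    pick_memmove (void)
    {
      if (has_avx512vl)
        return "__memmove_evex_unaligned_erms";
      if (has_avx2)
        return "__memmove_avx_unaligned_erms";
      /* The SSSE3/Fast_Copy_Backward branch was here; control now
         falls through directly to the baseline.  */
      if (has_erms)
        return "__memmove_sse2_unaligned_erms";
      return "__memmove_sse2_unaligned";
    }

    int
    main (void)
    {
      has_erms = 1;
      puts (pick_memmove ());  /* prints __memmove_sse2_unaligned_erms */
      return 0;
    }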
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 -
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
5 files changed, 3209 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 48f81711ae..323be3b969 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,14 +16,12 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 70b0e9c62e..d6852ab365 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned)
@@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
@@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1ecdd4b0d3..5596ddea2c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -92,13 +92,6 @@ IFUNC_SELECTOR (void)
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
- {
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
- }
-
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (sse2_unaligned_erms);
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
--
2.25.1
* [PATCH v1 5/6] x86: Remove str{n}cat-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (2 preceding siblings ...)
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-03-25 18:36 ` Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
` (2 subsequent siblings)
6 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it's no longer worth the code size cost.
---
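With the SSSE3 branch gone, the strcpy-family selector in
ifunc-strcpy.h reduces to the shape sketched below (a simplified
rendering of the hunk later in this patch; the AVX2/EVEX dispatch that
precedes those lines is unchanged and omitted here):

    /* Sketch only -- the real IFUNC_SELECTOR also tries EVEX/AVX2
       first.  With SSSE3 removed, SSE2 is the unconditional
       fallback.  */
    static inline void *
    IFUNC_SELECTOR (void)
    {
      const struct cpu_features *cpu_features = __get_cpu_features ();

      if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
        return OPTIMIZE (sse2_unaligned);

      return OPTIMIZE (sse2);
    }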
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
5 files changed, 879 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 323be3b969..a2ebc06c5f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -59,7 +59,6 @@ sysdep_routines += \
strcat-evex \
strcat-sse2 \
strcat-sse2-unaligned \
- strcat-ssse3 \
strchr-avx2 \
strchr-avx2-rtm \
strchr-evex \
@@ -97,7 +96,6 @@ sysdep_routines += \
strncat-c \
strncat-evex \
strncat-sse2-unaligned \
- strncat-ssse3 \
strncmp-avx2 \
strncmp-avx2-rtm \
strncmp-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d6852ab365..4133ed7e43 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcat_evex)
- IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
- __strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
@@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncat_evex)
- IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
- __strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 5bece38f78..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -23,7 +23,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index 9f39e4fcd1..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,866 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
--
2.25.1
* [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (3 preceding siblings ...)
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-03-25 18:36 ` Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
6 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result the SSSE3 versions are no longer worth the code size cost.
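All four removed entry points (strcpy, stpcpy, strncpy, stpncpy) are
built from the single strcpy-ssse3.S source via the USE_AS_* macro
stubs deleted below. A hypothetical plain-C rendering of that
multiplexing scheme (a sketch of the pattern only, not the SSSE3
assembly; the strncpy variants additionally honor the length bound
and NUL padding, which this sketch omits):

    /* One source file, several entry points, switched by macros,
       mirroring the STRCPY / USE_AS_STPCPY knobs in the stubs below.  */
    #ifndef STRCPY
    # define STRCPY strcpy_sketch
    #endif

    char *
    STRCPY (char *dst, const char *src)
    {
      char *d = dst;
      while ((*d = *src++) != '\0')
        d++;
    #ifdef USE_AS_STPCPY
      return d;     /* stpcpy flavor: pointer to the terminating NUL.  */
    #else
      return dst;   /* strcpy flavor: the original destination.  */
    #endif
    }

A stpcpy flavor would then mirror the deleted stpcpy-ssse3.S stub:
define USE_AS_STPCPY and STRCPY before including the file.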
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
6 files changed, 3572 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a2ebc06c5f..292353bad7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -42,13 +42,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -79,7 +77,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -106,7 +103,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 4133ed7e43..505b8002e0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (4 preceding siblings ...)
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
@ 2022-03-25 19:54 ` H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
6 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:54 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result, it's no longer worth the code size cost.
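
For context on that claim: the ifunc selector tries SSE4.1 before
SSSE3, so the only machines that ever resolved memcmp to the SSSE3
version are those with SSSE3 but not SSE4.1 (essentially Core 2,
first-generation Atom, and similar-era low-power parts).  A minimal
sketch for checking which group a given machine falls in, using the
GCC/clang __builtin_cpu_supports builtin (program and output format
are illustrative only):

#include <stdio.h>

int
main (void)
{
  /* Machines reporting ssse3 = 1, sse4.1 = 0 are the only ones that
     lose a specialized memcmp with this patch; anything newer already
     resolved to the SSE4.1/AVX2/EVEX versions.  */
  printf ("ssse3:  %d\n", __builtin_cpu_supports ("ssse3"));
  printf ("sse4.1: %d\n", __builtin_cpu_supports ("sse4.1"));
  return 0;
}
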
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
> sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
> sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
> sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
> 5 files changed, 2006 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 6507d1b7fa..51222dfab1 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -12,7 +12,6 @@ sysdep_routines += \
> memcmp-evex-movbe \
> memcmp-sse2 \
> memcmp-sse4 \
> - memcmp-ssse3 \
> memcmpeq-avx2 \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> @@ -179,7 +178,6 @@ sysdep_routines += \
> wmemcmp-c \
> wmemcmp-evex-movbe \
> wmemcmp-sse4 \
> - wmemcmp-ssse3 \
> # sysdep_routines
> endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 40cc6cc49e..f389928a4e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
> __memcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
> - __memcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> #ifdef SHARED
> @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __wmemcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
> __wmemcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
> - __wmemcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/wmemset.c. */
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> index cd12613699..44759a3ad5 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> @@ -20,7 +20,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
> @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> return OPTIMIZE (sse4_1);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
> diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> deleted file mode 100644
> index df1b1fc494..0000000000
> --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> +++ /dev/null
> @@ -1,1992 +0,0 @@
> -/* memcmp with SSSE3, wmemcmp with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef MEMCMP
> -# define MEMCMP __memcmp_ssse3
> -# endif
> -
> -/* Warning!
> - wmemcmp has to use SIGNED comparison for elements.
> - memcmp has to use UNSIGNED comparison for elements.
> -*/
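
That warning is the one real semantic difference between the two
entry points.  A minimal C reference of the required behavior, with
hypothetical names, not the SSSE3 algorithm itself: memcmp orders by
UNSIGNED bytes, wmemcmp by SIGNED wchar_t elements, which are 32-bit
signed ints on x86-64.

#include <stddef.h>
#include <wchar.h>

static int
memcmp_ref (const void *s1, const void *s2, size_t n)
{
  const unsigned char *a = s1, *b = s2;	/* UNSIGNED byte order.  */
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;
  return 0;
}

static int
wmemcmp_ref (const wchar_t *a, const wchar_t *b, size_t n)
{
  for (size_t i = 0; i < n; i++)
    if (a[i] != b[i])
      return a[i] < b[i] ? -1 : 1;	/* SIGNED wchar_t order.  */
  return 0;
}

For elements with the top bit set the two orders really do disagree:
bytes 0x80 and 0x7f compare as 128 > 127 for memcmp, while wchar_t
values 0x80000000 and 0x7fffffff compare as INT_MIN < INT_MAX for
wmemcmp.
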
> -
> - atom_text_section
> -ENTRY (MEMCMP)
> -# ifdef USE_AS_WMEMCMP
> - shl $2, %RDX_LP
> - test %RDX_LP, %RDX_LP
> - jz L(equal)
> -# elif defined __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -# endif
> - mov %rdx, %rcx
> - mov %rdi, %rdx
> - cmp $48, %rcx
> - jae L(48bytesormore) /* LEN >= 48 */
> -
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -/* RCX >= 48. */
> -L(48bytesormore):
> - movdqu (%rdi), %xmm3
> - movdqu (%rsi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 16(%rdi), %rdi
> - lea 16(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(less16bytes)
> - mov %edi, %edx
> - and $0xf, %edx
> - xor %rdx, %rdi
> - sub %rdx, %rsi
> - add %rdx, %rcx
> - mov %esi, %edx
> - and $0xf, %edx
> - jz L(shr_0)
> - xor %rdx, %rsi
> -
> -# ifndef USE_AS_WMEMCMP
> - cmp $8, %edx
> - jae L(next_unaligned_table)
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $1, %edx
> - je L(shr_1)
> - cmp $2, %edx
> - je L(shr_2)
> - cmp $3, %edx
> - je L(shr_3)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $5, %edx
> - je L(shr_5)
> - cmp $6, %edx
> - je L(shr_6)
> - jmp L(shr_7)
> -
> - .p2align 2
> -L(next_unaligned_table):
> - cmp $8, %edx
> - je L(shr_8)
> - cmp $9, %edx
> - je L(shr_9)
> - cmp $10, %edx
> - je L(shr_10)
> - cmp $11, %edx
> - je L(shr_11)
> - cmp $12, %edx
> - je L(shr_12)
> - cmp $13, %edx
> - je L(shr_13)
> - cmp $14, %edx
> - je L(shr_14)
> - jmp L(shr_15)
> -# else
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $8, %edx
> - je L(shr_8)
> - jmp L(shr_12)
> -# endif
> -
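
The L(shr_N) blocks below all instantiate one SSSE3 trick: once %rdi
has been aligned, %rsi is left misaligned by some N in 0..15, and each
16-byte chunk of the second buffer is rebuilt from two aligned loads
with palignr.  The palignr byte shift is an immediate that must be a
compile-time constant, which is why there is one nearly identical
block per N (and why the wmemcmp build only needs N = 0, 4, 8, 12:
wchar_t pointers keep the relative offset 4-byte aligned).  A rough
intrinsics sketch of a single step, for a hypothetical fixed N = 5:

#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8.  */

/* Compare 16 aligned bytes of s1 against bytes 5..20 past s2's
   aligned base, rebuilt from two aligned loads.  Returns nonzero
   when all 16 bytes match; reference semantics only.  */
static int
shr5_chunk_matches (const __m128i *s1, const __m128i *s2_base)
{
  __m128i lo = _mm_load_si128 (s2_base);	/* s2 bytes 0..15.  */
  __m128i hi = _mm_load_si128 (s2_base + 1);	/* s2 bytes 16..31.  */
  __m128i s2 = _mm_alignr_epi8 (hi, lo, 5);	/* s2 bytes 5..20.  */
  __m128i eq = _mm_cmpeq_epi8 (s2, _mm_load_si128 (s1));
  return _mm_movemask_epi8 (eq) == 0xffff;
}
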
> - .p2align 4
> -L(shr_0):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - jae L(shr_0_gobble)
> - xor %eax, %eax
> - movdqa (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> - pand %xmm1, %xmm2
> - pmovmskb %xmm2, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_0_gobble):
> - movdqa (%rsi), %xmm0
> - xor %eax, %eax
> - pcmpeqb (%rdi), %xmm0
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> -L(shr_0_gobble_loop):
> - pand %xmm0, %xmm2
> - sub $32, %rcx
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - movdqa 32(%rsi), %xmm0
> - movdqa 48(%rsi), %xmm2
> - sbb $0xffff, %edx
> - pcmpeqb 32(%rdi), %xmm0
> - pcmpeqb 48(%rdi), %xmm2
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - jz L(shr_0_gobble_loop)
> -
> - pand %xmm0, %xmm2
> - cmp $0, %rcx
> - jge L(next)
> - inc %edx
> - add $32, %rcx
> -L(next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
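
One non-obvious detail in the *_gobble loops: the loop-control and
equality tests are folded into a single flag update.  "sub $32, %rcx"
sets CF exactly when the count drops below 32, the SSE instructions
and lea in between leave EFLAGS untouched, and "sbb $0xffff, %edx"
then produces zero (taking the jz back-edge) only when all 32 bytes
compared equal and the count has not run out.  A small C model of
that exit test, with hypothetical names:

/* Model of "sub $32, %rcx" + "sbb $0xffff, %edx" + "jz".  mask is
   the 16-bit pmovmskb result of two ANDed pcmpeqb chunks, so it is
   0xffff iff all 32 bytes matched.  */
static int
gobble_keep_looping (unsigned mask, unsigned long remaining)
{
  unsigned carry = remaining < 32;	/* CF left by "sub $32, %rcx".  */
  return (mask - 0xffffu - carry) == 0;
}

The "inc %edx" / "add $32, %rcx" sequence after the loop undoes that
extra borrow when the exit was length-driven rather than a mismatch.
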
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_1):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_1_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $1, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $1, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_1_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $1, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_1_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $1, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $1, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_1_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_1_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_1_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 1(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -
> - .p2align 4
> -L(shr_2):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_2_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $2, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $2, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_2_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $2, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_2_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $2, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $2, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_2_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_2_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_2_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 2(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_3_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $3, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $3, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $3, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_3_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $3, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $3, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_3_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_3_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_3_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 3(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_4):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_4_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $4, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $4, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_4_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $4, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_4_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $4, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $4, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_4_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_4_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_4_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 4(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_5):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_5_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $5, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $5, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_5_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $5, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_5_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $5, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $5, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_5_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_5_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_5_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 5(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_6_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $6, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $6, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $6, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_6_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $6, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $6, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_6_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_6_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_6_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 6(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_7_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $7, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $7, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $7, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_7_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $7, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $7, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_7_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_7_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_7_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 7(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_8):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_8_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $8, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $8, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_8_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $8, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_8_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $8, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $8, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_8_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_8_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_8_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 8(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_9):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_9_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $9, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $9, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_9_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $9, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_9_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $9, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $9, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_9_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_9_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_9_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 9(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_10_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $10, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $10, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $10, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_10_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $10, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $10, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_10_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_10_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_10_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 10(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_11_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $11, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $11, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $11, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_11_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $11, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $11, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_11_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_11_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_11_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 11(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_12):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_12_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $12, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $12, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_12_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $12, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_12_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $12, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $12, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_12_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_12_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_12_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 12(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_13):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_13_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $13, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $13, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_13_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $13, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_13_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $13, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $13, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_13_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_13_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_13_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 13(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_14_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $14, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $14, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $14, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_14_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $14, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $14, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_14_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_14_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_14_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 14(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_15_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $15, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $15, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $15, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_15_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $15, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $15, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_15_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_15_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_15_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 15(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -# endif
> - .p2align 4
> -L(exit):
> - pmovmskb %xmm1, %r8d
> - sub $0xffff, %r8d
> - jz L(first16bytes)
> - lea -16(%rsi), %rsi
> - lea -16(%rdi), %rdi
> - mov %r8d, %edx
> -L(first16bytes):
> - add %rax, %rsi
> -L(less16bytes):
> -# ifndef USE_AS_WMEMCMP
> - test %dl, %dl
> - jz L(next_24_bytes)
> -
> - test $0x01, %dl
> - jnz L(Byte16)
> -
> - test $0x02, %dl
> - jnz L(Byte17)
> -
> - test $0x04, %dl
> - jnz L(Byte18)
> -
> - test $0x08, %dl
> - jnz L(Byte19)
> -
> - test $0x10, %dl
> - jnz L(Byte20)
> -
> - test $0x20, %dl
> - jnz L(Byte21)
> -
> - test $0x40, %dl
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte16):
> - movzbl -16(%rdi), %eax
> - movzbl -16(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte17):
> - movzbl -15(%rdi), %eax
> - movzbl -15(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte18):
> - movzbl -14(%rdi), %eax
> - movzbl -14(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte19):
> - movzbl -13(%rdi), %eax
> - movzbl -13(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte20):
> - movzbl -12(%rdi), %eax
> - movzbl -12(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte21):
> - movzbl -11(%rdi), %eax
> - movzbl -11(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte22):
> - movzbl -10(%rdi), %eax
> - movzbl -10(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(next_24_bytes):
> - lea 8(%rdi), %rdi
> - lea 8(%rsi), %rsi
> - test $0x01, %dh
> - jnz L(Byte16)
> -
> - test $0x02, %dh
> - jnz L(Byte17)
> -
> - test $0x04, %dh
> - jnz L(Byte18)
> -
> - test $0x08, %dh
> - jnz L(Byte19)
> -
> - test $0x10, %dh
> - jnz L(Byte20)
> -
> - test $0x20, %dh
> - jnz L(Byte21)
> -
> - test $0x40, %dh
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -# else
> -/* special for wmemcmp */
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words)
> - and $15, %dl
> - jz L(second_double_word)
> - mov -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(second_double_word):
> - mov -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words):
> - and $15, %dh
> - jz L(fourth_double_word)
> - mov -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word):
> - mov -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> - ret
> -# endif
> -
> - .p2align 4
> -L(less48bytes):
> - cmp $8, %ecx
> - jae L(more8bytes)
> - cmp $0, %ecx
> - je L(0bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $1, %ecx
> - je L(1bytes)
> - cmp $2, %ecx
> - je L(2bytes)
> - cmp $3, %ecx
> - je L(3bytes)
> - cmp $4, %ecx
> - je L(4bytes)
> - cmp $5, %ecx
> - je L(5bytes)
> - cmp $6, %ecx
> - je L(6bytes)
> - jmp L(7bytes)
> -# else
> - jmp L(4bytes)
> -# endif
> -
> - .p2align 4
> -L(more8bytes):
> - cmp $16, %ecx
> - jae L(more16bytes)
> - cmp $8, %ecx
> - je L(8bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $9, %ecx
> - je L(9bytes)
> - cmp $10, %ecx
> - je L(10bytes)
> - cmp $11, %ecx
> - je L(11bytes)
> - cmp $12, %ecx
> - je L(12bytes)
> - cmp $13, %ecx
> - je L(13bytes)
> - cmp $14, %ecx
> - je L(14bytes)
> - jmp L(15bytes)
> -# else
> - jmp L(12bytes)
> -# endif
> -
> - .p2align 4
> -L(more16bytes):
> - cmp $24, %ecx
> - jae L(more24bytes)
> - cmp $16, %ecx
> - je L(16bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $17, %ecx
> - je L(17bytes)
> - cmp $18, %ecx
> - je L(18bytes)
> - cmp $19, %ecx
> - je L(19bytes)
> - cmp $20, %ecx
> - je L(20bytes)
> - cmp $21, %ecx
> - je L(21bytes)
> - cmp $22, %ecx
> - je L(22bytes)
> - jmp L(23bytes)
> -# else
> - jmp L(20bytes)
> -# endif
> -
> - .p2align 4
> -L(more24bytes):
> - cmp $32, %ecx
> - jae L(more32bytes)
> - cmp $24, %ecx
> - je L(24bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $25, %ecx
> - je L(25bytes)
> - cmp $26, %ecx
> - je L(26bytes)
> - cmp $27, %ecx
> - je L(27bytes)
> - cmp $28, %ecx
> - je L(28bytes)
> - cmp $29, %ecx
> - je L(29bytes)
> - cmp $30, %ecx
> - je L(30bytes)
> - jmp L(31bytes)
> -# else
> - jmp L(28bytes)
> -# endif
> -
> - .p2align 4
> -L(more32bytes):
> - cmp $40, %ecx
> - jae L(more40bytes)
> - cmp $32, %ecx
> - je L(32bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $33, %ecx
> - je L(33bytes)
> - cmp $34, %ecx
> - je L(34bytes)
> - cmp $35, %ecx
> - je L(35bytes)
> - cmp $36, %ecx
> - je L(36bytes)
> - cmp $37, %ecx
> - je L(37bytes)
> - cmp $38, %ecx
> - je L(38bytes)
> - jmp L(39bytes)
> -# else
> - jmp L(36bytes)
> -# endif
> -
> - .p2align 4
> -L(more40bytes):
> - cmp $40, %ecx
> - je L(40bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $41, %ecx
> - je L(41bytes)
> - cmp $42, %ecx
> - je L(42bytes)
> - cmp $43, %ecx
> - je L(43bytes)
> - cmp $44, %ecx
> - je L(44bytes)
> - cmp $45, %ecx
> - je L(45bytes)
> - cmp $46, %ecx
> - je L(46bytes)
> - jmp L(47bytes)
> -
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - movl -44(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - movl -40(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - movl -36(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - movl -32(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - movl -28(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - movl -24(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - movl -20(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - movl -16(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - movl -12(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - movl -8(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - movl -4(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# else
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - cmp -44(%rsi), %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - cmp -40(%rsi), %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - cmp -36(%rsi), %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - cmp -32(%rsi), %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - cmp -28(%rsi), %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - cmp -24(%rsi), %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - cmp -20(%rsi), %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# endif
> -
> -# ifndef USE_AS_WMEMCMP
> - .p2align 4
> -L(45bytes):
> - movl -45(%rdi), %eax
> - movl -45(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(41bytes):
> - movl -41(%rdi), %eax
> - movl -41(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(37bytes):
> - movl -37(%rdi), %eax
> - movl -37(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(33bytes):
> - movl -33(%rdi), %eax
> - movl -33(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(29bytes):
> - movl -29(%rdi), %eax
> - movl -29(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(25bytes):
> - movl -25(%rdi), %eax
> - movl -25(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(21bytes):
> - movl -21(%rdi), %eax
> - movl -21(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(17bytes):
> - movl -17(%rdi), %eax
> - movl -17(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(13bytes):
> - movl -13(%rdi), %eax
> - movl -13(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(9bytes):
> - movl -9(%rdi), %eax
> - movl -9(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(5bytes):
> - movl -5(%rdi), %eax
> - movl -5(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(1bytes):
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(46bytes):
> - movl -46(%rdi), %eax
> - movl -46(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(42bytes):
> - movl -42(%rdi), %eax
> - movl -42(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(38bytes):
> - movl -38(%rdi), %eax
> - movl -38(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(34bytes):
> - movl -34(%rdi), %eax
> - movl -34(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(30bytes):
> - movl -30(%rdi), %eax
> - movl -30(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(26bytes):
> - movl -26(%rdi), %eax
> - movl -26(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(22bytes):
> - movl -22(%rdi), %eax
> - movl -22(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(18bytes):
> - movl -18(%rdi), %eax
> - movl -18(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(14bytes):
> - movl -14(%rdi), %eax
> - movl -14(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(10bytes):
> - movl -10(%rdi), %eax
> - movl -10(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(6bytes):
> - movl -6(%rdi), %eax
> - movl -6(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(2bytes):
> - movzwl -2(%rdi), %eax
> - movzwl -2(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(47bytes):
> - movl -47(%rdi), %eax
> - movl -47(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(43bytes):
> - movl -43(%rdi), %eax
> - movl -43(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(39bytes):
> - movl -39(%rdi), %eax
> - movl -39(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(35bytes):
> - movl -35(%rdi), %eax
> - movl -35(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(31bytes):
> - movl -31(%rdi), %eax
> - movl -31(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(27bytes):
> - movl -27(%rdi), %eax
> - movl -27(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(23bytes):
> - movl -23(%rdi), %eax
> - movl -23(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(19bytes):
> - movl -19(%rdi), %eax
> - movl -19(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(15bytes):
> - movl -15(%rdi), %eax
> - movl -15(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(11bytes):
> - movl -11(%rdi), %eax
> - movl -11(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(7bytes):
> - movl -7(%rdi), %eax
> - movl -7(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(3bytes):
> - movzwl -3(%rdi), %eax
> - movzwl -3(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(find_diff):
> - cmpb %cl, %al
> - jne L(set)
> - cmpw %cx, %ax
> - jne L(set)
> - shr $16, %eax
> - shr $16, %ecx
> - cmpb %cl, %al
> - jne L(set)
> -
> -/* We get there only if we already know there is a
> -difference. */
> -
> - cmp %ecx, %eax
> -L(set):
> - sbb %eax, %eax
> - sbb $-1, %eax
> - ret
> -# else
> -
> -/* for wmemcmp */
> - .p2align 4
> -L(find_diff):
> - mov $1, %eax
> - jg L(find_diff_bigger)
> - neg %eax
> - ret
> -
> - .p2align 4
> -L(find_diff_bigger):
> - ret
> -# endif
> -
> - .p2align 4
> -L(equal):
> - xor %eax, %eax
> - ret
> -
> -END (MEMCMP)
> -#endif
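A side note on the L(set) sequence near the end of the deleted file: the
sbb %eax, %eax / sbb $-1, %eax pair converts the carry flag from the final
compare into the -1/+1 memcmp result without a branch.  A minimal C model
of the idiom (illustrative only, not glibc code):

/* Given a != b, derive the sign branch-free; "borrow" models CF
   as set by the "cmp %ecx, %eax" just before L(set).  */
static int
sign_from_borrow (unsigned int a, unsigned int b)
{
  int borrow = a < b;      /* cmp: CF = (a < b)                  */
  int r = -borrow;         /* sbb %eax, %eax -> 0 or -1, CF kept */
  return r + 1 - borrow;   /* sbb $-1, %eax  -> +1 or -1         */
}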
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> deleted file mode 100644
> index a41ef95fc1..0000000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_WMEMCMP 1
> -#define MEMCMP __wmemcmp_ssse3
> -
> -#include "memcmp-ssse3.S"
> --
> 2.25.1
>
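For readers skimming the deleted file: its central technique is the one
visible in the L(exit) path above.  Each 16-byte block is compared with
pcmpeqb, pmovmskb collapses the per-byte results into a 16-bit mask, and a
mask that is not 0xffff flags the first differing block.  A rough intrinsics
equivalent (a sketch, not the glibc source):

#include <emmintrin.h>

/* Return non-zero iff the 16-byte blocks at s1/s2 differ; after the
   XOR, set bits mark the differing byte positions.  */
static int
block_diff_mask (const unsigned char *s1, const unsigned char *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b)) ^ 0xffff;
}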
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-03-25 19:55 ` H.J. Lu
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (8 subsequent siblings)
9 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:55 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
> sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
> sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
> sysdeps/x86_64/multiarch/strcmp.c | 4 -
> sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
> sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
> sysdeps/x86_64/multiarch/strncmp.c | 4 -
> sysdeps/x86_64/strcmp.S | 155 ++++--------------
> 10 files changed, 30 insertions(+), 202 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 51222dfab1..ed2def288d 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -58,7 +58,6 @@ sysdep_routines += \
> strcasecmp_l-evex \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> - strcasecmp_l-ssse3 \
> strcat-avx2 \
> strcat-avx2-rtm \
> strcat-evex \
> @@ -80,7 +79,6 @@ sysdep_routines += \
> strcmp-sse2 \
> strcmp-sse2-unaligned \
> strcmp-sse4_2 \
> - strcmp-ssse3 \
> strcpy-avx2 \
> strcpy-avx2-rtm \
> strcpy-evex \
> @@ -98,7 +96,6 @@ sysdep_routines += \
> strncase_l-evex \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> - strncase_l-ssse3 \
> strncat-avx2 \
> strncat-avx2-rtm \
> strncat-c \
> @@ -110,7 +107,6 @@ sysdep_routines += \
> strncmp-evex \
> strncmp-sse2 \
> strncmp-sse4_2 \
> - strncmp-ssse3 \
> strncpy-avx2 \
> strncpy-avx2-rtm \
> strncpy-c \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index f389928a4e..7e2be3554b 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strcasecmp,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strcasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_l_sse42)
> - IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strcasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
> __strcasecmp_l_sse2))
>
> @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strcmp_evex)
> IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
> __strcmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
> - __strcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
>
> @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strncasecmp,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strncasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
> __strncasecmp_sse2))
>
> @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_l_sse42)
> - IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strncasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
> __strncasecmp_l_sse2))
>
> @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strncmp_evex)
> IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
> __strncmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
> - __strncmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
>
> #ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 766539c241..296d32071b 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -20,7 +20,6 @@
> #include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
> && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> return OPTIMIZE (sse42);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
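All of these selector headers share one IFUNC shape: a resolver runs once at
relocation time, probes CPU features, and returns the best implementation.
A simplified, self-contained sketch of the pattern (my_cmp* are made-up
names, not glibc's):

#include <string.h>

static int
my_cmp_sse2 (const char *a, const char *b)
{ return strcmp (a, b); }   /* stand-in baseline body */

static int
my_cmp_sse42 (const char *a, const char *b)
{ return strcmp (a, b); }   /* stand-in accelerated body */

/* Resolver: the chosen function is bound before the first call.  */
static int (*my_cmp_resolver (void)) (const char *, const char *)
{
  if (__builtin_cpu_supports ("sse4.2"))
    return my_cmp_sse42;
  return my_cmp_sse2;
}

int my_cmp (const char *, const char *)
     __attribute__ ((ifunc ("my_cmp_resolver")));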
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> deleted file mode 100644
> index fb2f9ae14a..0000000000
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> +++ /dev/null
> @@ -1,6 +0,0 @@
> -#define USE_SSSE3 1
> -#define USE_AS_STRCASECMP_L
> -#define NO_NOLOCALE_ALIAS
> -#define STRCMP __strcasecmp_l_ssse3
> -#define __strcasecmp __strcasecmp_ssse3
> -#include "../strcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
> deleted file mode 100644
> index 1b7fa33c91..0000000000
> --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
> +++ /dev/null
> @@ -1,5 +0,0 @@
> -#if IS_IN (libc)
> -# define USE_SSSE3 1
> -# define STRCMP __strcmp_ssse3
> -# include "../strcmp.S"
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
> index 68cb73baad..a248c2a6e6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.c
> +++ b/sysdeps/x86_64/multiarch/strcmp.c
> @@ -28,7 +28,6 @@
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> return OPTIMIZE (sse2_unaligned);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
>
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> deleted file mode 100644
> index 6728678688..0000000000
> --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> +++ /dev/null
> @@ -1,6 +0,0 @@
> -#define USE_SSSE3 1
> -#define USE_AS_STRNCASECMP_L
> -#define NO_NOLOCALE_ALIAS
> -#define STRCMP __strncasecmp_l_ssse3
> -#define __strncasecmp __strncasecmp_ssse3
> -#include "../strcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
> deleted file mode 100644
> index ec37308347..0000000000
> --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
> +++ /dev/null
> @@ -1,28 +0,0 @@
> -/* strcmp optimized with SSSE3.
> - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#define STRCMP __strncmp_ssse3
> -
> -#undef libc_hidden_builtin_def
> -#define libc_hidden_builtin_def(strcmp)
> -
> -#define USE_SSSE3 1
> -#define USE_AS_STRNCMP
> -#include <sysdeps/x86_64/strcmp.S>
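Note the shape of the four deleted files: none contains code of its own.
They are thin multiarch wrappers that rename the entry point and set feature
macros, then include the one shared implementation.  Schematically
(hypothetical names):

/* variant wrapper file, in outline: */
#define STRCMP __my_strcmp_variant   /* renamed entry point */
#define USE_MY_FEATURE 1             /* gates code paths    */
#include "../shared-strcmp-body.S"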
> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
> index fca74199d8..70ae6547c9 100644
> --- a/sysdeps/x86_64/multiarch/strncmp.c
> +++ b/sysdeps/x86_64/multiarch/strncmp.c
> @@ -27,7 +27,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
> && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> return OPTIMIZE (sse42);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index 99d8b36f1d..c38dc627f9 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -59,12 +59,7 @@
> # endif
> #endif
>
> -#ifndef USE_SSSE3
> .text
> -#else
> - .section .text.ssse3,"ax",@progbits
> -#endif
> -
> #ifdef USE_AS_STRCASECMP_L
> # ifndef ENTRY2
> # define ENTRY2(name) ENTRY (name)
> @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4 /* store for next cycle */
>
> -#ifndef USE_SSSE3
> psrldq $1, %xmm3
> pslldq $15, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4 /* store for next cycle */
>
> -#ifndef USE_SSSE3
> psrldq $1, %xmm3
> pslldq $15, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $2, %xmm3
> pslldq $14, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $2, %xmm3
> pslldq $14, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $3, %xmm3
> pslldq $13, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $3, %xmm3
> pslldq $13, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $4, %xmm3
> pslldq $12, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $4, %xmm3
> pslldq $12, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $5, %xmm3
> pslldq $11, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $5, %xmm3
> pslldq $11, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $6, %xmm3
> pslldq $10, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $6, %xmm3
> pslldq $10, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $7, %xmm3
> pslldq $9, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $7, %xmm3
> pslldq $9, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $8, %xmm3
> pslldq $8, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $8, %xmm3
> pslldq $8, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $9, %xmm3
> pslldq $7, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $9, %xmm3
> pslldq $7, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $10, %xmm3
> pslldq $6, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $10, %xmm3
> pslldq $6, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $11, %xmm3
> pslldq $5, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $11, %xmm3
> pslldq $5, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $12, %xmm3
> pslldq $4, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $12, %xmm3
> pslldq $4, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $13, %xmm3
> pslldq $3, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $13, %xmm3
> pslldq $3, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $14, %xmm3
> pslldq $2, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $14, %xmm3
> pslldq $2, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $15, %xmm3
> pslldq $1, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $15, %xmm3
> pslldq $1, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> --
> 2.25.1
>
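The bulk of this patch is the mechanical strcmp.S cleanup above: with the
SSSE3 build dropped, each #ifndef USE_SSSE3 conditional collapses to the
unconditional three-instruction SSE2 merge.  The deleted palignr was a
single-instruction equivalent of the sequence that remains; in intrinsics
terms (a sketch, not the source):

#include <emmintrin.h>    /* SSE2  */
#include <tmmintrin.h>    /* SSSE3 */

/* Both forms return bytes N..N+15 of the 32-byte concatenation hi:lo
   (lo least significant); N must be a constant in 1..15.  */
#define MERGE_SSE2(hi, lo, N) \
  _mm_or_si128 (_mm_srli_si128 ((lo), (N)), \
                _mm_slli_si128 ((hi), 16 - (N)))
#define MERGE_SSSE3(hi, lo, N)  _mm_alignr_epi8 ((hi), (lo), (N))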
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
@ 2022-03-25 19:56 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:56 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
> sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +-
> sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 --------------------
> sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 -
> 5 files changed, 7 insertions(+), 3183 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index ed2def288d..48f81711ae 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,7 +16,6 @@ sysdep_routines += \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> memcmpeq-sse2 \
> - memcpy-ssse3 \
> memcpy-ssse3-back \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> @@ -24,7 +23,6 @@ sysdep_routines += \
> memmove-avx512-unaligned-erms \
> memmove-evex-unaligned-erms \
> memmove-sse2-unaligned-erms \
> - memmove-ssse3 \
> memmove-ssse3-back \
> memrchr-avx2 \
> memrchr-avx2-rtm \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7e2be3554b..70b0e9c62e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memmove_chk_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, __memmove_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memmove_chk_ssse3)
> IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
> __memmove_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
> @@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memmove_avx512_unaligned_erms)
> IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> __memmove_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> - __memmove_ssse3)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
> IFUNC_IMPL_ADD (array, i, memmove, 1,
> __memmove_sse2_unaligned)
> @@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memcpy_chk_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_chk_ssse3)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
> __memcpy_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
> @@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memcpy_evex_unaligned_erms)
> IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> __memcpy_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, memcpy,
> CPU_FEATURE_USABLE (AVX512F),
> __memcpy_avx512_no_vzeroupper)
> @@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_chk_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_chk_ssse3)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
> __mempcpy_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
> @@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __mempcpy_evex_unaligned_erms)
> IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_ssse3_back)
> - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> __mempcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> index f8f958064c..1ecdd4b0d3 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> @@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
> attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
> attribute_hidden;
> @@ -94,17 +92,15 @@ IFUNC_SELECTOR (void)
> }
> }
>
> - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> - return OPTIMIZE (sse2_unaligned_erms);
> -
> - return OPTIMIZE (sse2_unaligned);
> + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
> + return OPTIMIZE (ssse3_back);
> }
>
> - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
> - return OPTIMIZE (ssse3_back);
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE (sse2_unaligned_erms);
>
> - return OPTIMIZE (ssse3);
> + return OPTIMIZE (sse2_unaligned);
> }
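The ifunc-memmove.h rewrite is easier to read flattened out: SSSE3 now
survives only when backward copies are known to be fast, and everything
else falls through to SSE2.  A hypothetical condensed model (the flag
names are stand-ins for the CPU_FEATURE* probes, not the real header):

enum memmove_impl { SSE2_UNALIGNED, SSE2_UNALIGNED_ERMS, SSSE3_BACK };

static enum memmove_impl
select_memmove (int has_ssse3, int fast_unaligned_copy,
                int fast_copy_backward, int has_erms)
{
  if (has_ssse3 && !fast_unaligned_copy && fast_copy_backward)
    return SSSE3_BACK;              /* sole remaining SSSE3 pick */
  if (has_erms)
    return SSE2_UNALIGNED_ERMS;
  return SSE2_UNALIGNED;
}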
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> deleted file mode 100644
> index 65644d3a09..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> +++ /dev/null
> @@ -1,3151 +0,0 @@
> -/* memcpy with SSSE3
> - Copyright (C) 2010-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -
> -#include "asm-syntax.h"
> -
> -#ifndef MEMCPY
> -# define MEMCPY __memcpy_ssse3
> -# define MEMCPY_CHK __memcpy_chk_ssse3
> -# define MEMPCPY __mempcpy_ssse3
> -# define MEMPCPY_CHK __mempcpy_chk_ssse3
> -#endif
> -
> -#define JMPTBL(I, B) I - B
> -
> -/* Branch to an entry in a jump table. TABLE is a jump table with
> - relative offsets. INDEX is a register contains the index into the
> - jump table. SCALE is the scale of INDEX. */
> -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), INDEX; \
> - lea (%r11, INDEX), INDEX; \
> - _CET_NOTRACK jmp *INDEX; \
> - ud2
> -
> - .section .text.ssse3,"ax",@progbits
> -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
> -ENTRY (MEMPCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMPCPY_CHK)
> -
> -ENTRY (MEMPCPY)
> - mov %RDI_LP, %RAX_LP
> - add %RDX_LP, %RAX_LP
> - jmp L(start)
> -END (MEMPCPY)
> -#endif
> -
> -#if !defined USE_AS_BCOPY
> -ENTRY (MEMCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMCPY_CHK)
> -#endif
> -
> -ENTRY (MEMCPY)
> - mov %RDI_LP, %RAX_LP
> -#ifdef USE_AS_MEMPCPY
> - add %RDX_LP, %RAX_LP
> -#endif
> -
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -#endif
> -
> -#ifdef USE_AS_MEMMOVE
> - cmp %rsi, %rdi
> - jb L(copy_forward)
> - je L(write_0bytes)
> - cmp $79, %rdx
> - jbe L(copy_forward)
> - jmp L(copy_backward)
> -L(copy_forward):
> -#endif
> -L(start):
> - cmp $79, %rdx
> - lea L(table_less_80bytes)(%rip), %r11
> - ja L(80bytesormore)
> - movslq (%r11, %rdx, 4), %r9
> - add %rdx, %rsi
> - add %rdx, %rdi
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(80bytesormore):
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jle L(copy_backward)
> -#endif
> -
> - movdqu (%rsi), %xmm0
> - mov %rdi, %rcx
> - and $-16, %rdi
> - add $16, %rdi
> - mov %rcx, %r8
> - sub %rdi, %rcx
> - add %rcx, %rdx
> - sub %rcx, %rsi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> - cmp %rcx, %rdx
> - mov %rsi, %r9
> - ja L(large_page_fwd)
> - and $0xf, %r9
> - jz L(shl_0)
> -#ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_data_cache_size_half(%rip), %RCX_LP
> -#endif
> - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
> -
> - .p2align 4
> -L(copy_backward):
> - movdqu -16(%rsi, %rdx), %xmm0
> - add %rdx, %rsi
> - lea -16(%rdi, %rdx), %r8
> - add %rdx, %rdi
> -
> - mov %rdi, %rcx
> - and $0xf, %rcx
> - xor %rcx, %rdi
> - sub %rcx, %rdx
> - sub %rcx, %rsi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -
> - cmp %rcx, %rdx
> - mov %rsi, %r9
> - ja L(large_page_bwd)
> - and $0xf, %r9
> - jz L(shl_0_bwd)
> -#ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_data_cache_size_half(%rip), %RCX_LP
> -#endif
> - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
> -
> - .p2align 4
> -L(shl_0):
> - sub $16, %rdx
> - movdqa (%rsi), %xmm1
> - add $16, %rsi
> - movdqa %xmm1, (%rdi)
> - add $16, %rdi
> - cmp $128, %rdx
> - movdqu %xmm0, (%r8)
> - ja L(shl_0_gobble)
> - cmp $64, %rdx
> - jb L(shl_0_less_64bytes)
> - movaps (%rsi), %xmm4
> - movaps 16(%rsi), %xmm1
> - movaps 32(%rsi), %xmm2
> - movaps 48(%rsi), %xmm3
> - movaps %xmm4, (%rdi)
> - movaps %xmm1, 16(%rdi)
> - movaps %xmm2, 32(%rdi)
> - movaps %xmm3, 48(%rdi)
> - sub $64, %rdx
> - add $64, %rsi
> - add $64, %rdi
> -L(shl_0_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble):
> -#ifdef DATA_CACHE_SIZE_HALF
> - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %RDX_LP
> -#endif
> - lea -128(%rdx), %rdx
> - jae L(shl_0_gobble_mem_loop)
> -L(shl_0_gobble_cache_loop):
> - movdqa (%rsi), %xmm4
> - movaps 0x10(%rsi), %xmm1
> - movaps 0x20(%rsi), %xmm2
> - movaps 0x30(%rsi), %xmm3
> -
> - movdqa %xmm4, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> -
> - sub $128, %rdx
> - movaps 0x40(%rsi), %xmm4
> - movaps 0x50(%rsi), %xmm5
> - movaps 0x60(%rsi), %xmm6
> - movaps 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> - movaps %xmm4, 0x40(%rdi)
> - movaps %xmm5, 0x50(%rdi)
> - movaps %xmm6, 0x60(%rdi)
> - movaps %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_cache_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_cache_less_64bytes)
> -
> - movdqa (%rsi), %xmm4
> - sub $0x40, %rdx
> - movdqa 0x10(%rsi), %xmm1
> -
> - movdqa %xmm4, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> -
> - movdqa 0x20(%rsi), %xmm4
> - movdqa 0x30(%rsi), %xmm1
> - add $0x40, %rsi
> -
> - movdqa %xmm4, 0x20(%rdi)
> - movdqa %xmm1, 0x30(%rdi)
> - add $0x40, %rdi
> -L(shl_0_cache_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_mem_loop):
> - prefetcht0 0x1c0(%rsi)
> - prefetcht0 0x280(%rsi)
> -
> - movdqa (%rsi), %xmm0
> - movdqa 0x10(%rsi), %xmm1
> - movdqa 0x20(%rsi), %xmm2
> - movdqa 0x30(%rsi), %xmm3
> - movdqa 0x40(%rsi), %xmm4
> - movdqa 0x50(%rsi), %xmm5
> - movdqa 0x60(%rsi), %xmm6
> - movdqa 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> - sub $0x80, %rdx
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - movdqa %xmm2, 0x20(%rdi)
> - movdqa %xmm3, 0x30(%rdi)
> - movdqa %xmm4, 0x40(%rdi)
> - movdqa %xmm5, 0x50(%rdi)
> - movdqa %xmm6, 0x60(%rdi)
> - movdqa %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_mem_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_mem_less_64bytes)
> -
> - movdqa (%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa 0x10(%rsi), %xmm1
> -
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> -
> - movdqa 0x20(%rsi), %xmm0
> - movdqa 0x30(%rsi), %xmm1
> - add $0x40, %rsi
> -
> - movdqa %xmm0, 0x20(%rdi)
> - movdqa %xmm1, 0x30(%rdi)
> - add $0x40, %rdi
> -L(shl_0_mem_less_64bytes):
> - cmp $0x20, %rdx
> - jb L(shl_0_mem_less_32bytes)
> - movdqa (%rsi), %xmm0
> - sub $0x20, %rdx
> - movdqa 0x10(%rsi), %xmm1
> - add $0x20, %rsi
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - add $0x20, %rdi
> -L(shl_0_mem_less_32bytes):
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_bwd):
> - sub $16, %rdx
> - movdqa -0x10(%rsi), %xmm1
> - sub $16, %rsi
> - movdqa %xmm1, -0x10(%rdi)
> - sub $16, %rdi
> - cmp $0x80, %rdx
> - movdqu %xmm0, (%r8)
> - ja L(shl_0_gobble_bwd)
> - cmp $64, %rdx
> - jb L(shl_0_less_64bytes_bwd)
> - movaps -0x10(%rsi), %xmm0
> - movaps -0x20(%rsi), %xmm1
> - movaps -0x30(%rsi), %xmm2
> - movaps -0x40(%rsi), %xmm3
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - sub $64, %rdx
> - sub $0x40, %rsi
> - sub $0x40, %rdi
> -L(shl_0_less_64bytes_bwd):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_bwd):
> -#ifdef DATA_CACHE_SIZE_HALF
> - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %RDX_LP
> -#endif
> - lea -128(%rdx), %rdx
> - jae L(shl_0_gobble_mem_bwd_loop)
> -L(shl_0_gobble_bwd_loop):
> - movdqa -0x10(%rsi), %xmm0
> - movaps -0x20(%rsi), %xmm1
> - movaps -0x30(%rsi), %xmm2
> - movaps -0x40(%rsi), %xmm3
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> -
> - sub $0x80, %rdx
> - movaps -0x50(%rsi), %xmm4
> - movaps -0x60(%rsi), %xmm5
> - movaps -0x70(%rsi), %xmm6
> - movaps -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> - movaps %xmm4, -0x50(%rdi)
> - movaps %xmm5, -0x60(%rdi)
> - movaps %xmm6, -0x70(%rdi)
> - movaps %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_gobble_bwd_less_64bytes)
> -
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa -0x20(%rsi), %xmm1
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> -
> - movdqa -0x30(%rsi), %xmm0
> - movdqa -0x40(%rsi), %xmm1
> - sub $0x40, %rsi
> -
> - movdqa %xmm0, -0x30(%rdi)
> - movdqa %xmm1, -0x40(%rdi)
> - sub $0x40, %rdi
> -L(shl_0_gobble_bwd_less_64bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_mem_bwd_loop):
> - prefetcht0 -0x1c0(%rsi)
> - prefetcht0 -0x280(%rsi)
> - movdqa -0x10(%rsi), %xmm0
> - movdqa -0x20(%rsi), %xmm1
> - movdqa -0x30(%rsi), %xmm2
> - movdqa -0x40(%rsi), %xmm3
> - movdqa -0x50(%rsi), %xmm4
> - movdqa -0x60(%rsi), %xmm5
> - movdqa -0x70(%rsi), %xmm6
> - movdqa -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> - sub $0x80, %rdx
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> - movdqa %xmm2, -0x30(%rdi)
> - movdqa %xmm3, -0x40(%rdi)
> - movdqa %xmm4, -0x50(%rdi)
> - movdqa %xmm5, -0x60(%rdi)
> - movdqa %xmm6, -0x70(%rdi)
> - movdqa %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_mem_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_mem_bwd_less_64bytes)
> -
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa -0x20(%rsi), %xmm1
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> -
> - movdqa -0x30(%rsi), %xmm0
> - movdqa -0x40(%rsi), %xmm1
> - sub $0x40, %rsi
> -
> - movdqa %xmm0, -0x30(%rdi)
> - movdqa %xmm1, -0x40(%rdi)
> - sub $0x40, %rdi
> -L(shl_0_mem_bwd_less_64bytes):
> - cmp $0x20, %rdx
> - jb L(shl_0_mem_bwd_less_32bytes)
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x20, %rdx
> - movdqa -0x20(%rsi), %xmm1
> - sub $0x20, %rsi
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> - sub $0x20, %rdi
> -L(shl_0_mem_bwd_less_32bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1):
> - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x01(%rsi), %xmm1
> - jb L(L1_fwd)
> - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
> -L(L1_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_1_loop_L1):
> - sub $64, %rdx
> - movaps 0x0f(%rsi), %xmm2
> - movaps 0x1f(%rsi), %xmm3
> - movaps 0x2f(%rsi), %xmm4
> - movaps 0x3f(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $1, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $1, %xmm3, %xmm4
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $1, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_1_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1_bwd):
> - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x01(%rsi), %xmm1
> - jb L(L1_bwd)
> - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
> -L(L1_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_1_bwd_loop_L1):
> - movaps -0x11(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x21(%rsi), %xmm3
> - movaps -0x31(%rsi), %xmm4
> - movaps -0x41(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $1, %xmm2, %xmm1
> - palignr $1, %xmm3, %xmm2
> - palignr $1, %xmm4, %xmm3
> - palignr $1, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_1_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2):
> - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x02(%rsi), %xmm1
> - jb L(L2_fwd)
> - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
> -L(L2_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_2_loop_L1):
> - sub $64, %rdx
> - movaps 0x0e(%rsi), %xmm2
> - movaps 0x1e(%rsi), %xmm3
> - movaps 0x2e(%rsi), %xmm4
> - movaps 0x3e(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $2, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $2, %xmm3, %xmm4
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $2, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_2_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2_bwd):
> - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x02(%rsi), %xmm1
> - jb L(L2_bwd)
> - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
> -L(L2_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_2_bwd_loop_L1):
> - movaps -0x12(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x22(%rsi), %xmm3
> - movaps -0x32(%rsi), %xmm4
> - movaps -0x42(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $2, %xmm2, %xmm1
> - palignr $2, %xmm3, %xmm2
> - palignr $2, %xmm4, %xmm3
> - palignr $2, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_2_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3):
> - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x03(%rsi), %xmm1
> - jb L(L3_fwd)
> - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
> -L(L3_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_3_loop_L1):
> - sub $64, %rdx
> - movaps 0x0d(%rsi), %xmm2
> - movaps 0x1d(%rsi), %xmm3
> - movaps 0x2d(%rsi), %xmm4
> - movaps 0x3d(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $3, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $3, %xmm3, %xmm4
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $3, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_3_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3_bwd):
> - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x03(%rsi), %xmm1
> - jb L(L3_bwd)
> - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
> -L(L3_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_3_bwd_loop_L1):
> - movaps -0x13(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x23(%rsi), %xmm3
> - movaps -0x33(%rsi), %xmm4
> - movaps -0x43(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $3, %xmm2, %xmm1
> - palignr $3, %xmm3, %xmm2
> - palignr $3, %xmm4, %xmm3
> - palignr $3, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_3_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4):
> - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x04(%rsi), %xmm1
> - jb L(L4_fwd)
> - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
> -L(L4_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_4_loop_L1):
> - sub $64, %rdx
> - movaps 0x0c(%rsi), %xmm2
> - movaps 0x1c(%rsi), %xmm3
> - movaps 0x2c(%rsi), %xmm4
> - movaps 0x3c(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $4, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $4, %xmm3, %xmm4
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $4, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_4_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4_bwd):
> - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x04(%rsi), %xmm1
> - jb L(L4_bwd)
> - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
> -L(L4_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_4_bwd_loop_L1):
> - movaps -0x14(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x24(%rsi), %xmm3
> - movaps -0x34(%rsi), %xmm4
> - movaps -0x44(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $4, %xmm2, %xmm1
> - palignr $4, %xmm3, %xmm2
> - palignr $4, %xmm4, %xmm3
> - palignr $4, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_4_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5):
> - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x05(%rsi), %xmm1
> - jb L(L5_fwd)
> - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
> -L(L5_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_5_loop_L1):
> - sub $64, %rdx
> - movaps 0x0b(%rsi), %xmm2
> - movaps 0x1b(%rsi), %xmm3
> - movaps 0x2b(%rsi), %xmm4
> - movaps 0x3b(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $5, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $5, %xmm3, %xmm4
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $5, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_5_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5_bwd):
> - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x05(%rsi), %xmm1
> - jb L(L5_bwd)
> - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
> -L(L5_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_5_bwd_loop_L1):
> - movaps -0x15(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x25(%rsi), %xmm3
> - movaps -0x35(%rsi), %xmm4
> - movaps -0x45(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $5, %xmm2, %xmm1
> - palignr $5, %xmm3, %xmm2
> - palignr $5, %xmm4, %xmm3
> - palignr $5, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_5_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6):
> - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x06(%rsi), %xmm1
> - jb L(L6_fwd)
> - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
> -L(L6_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_6_loop_L1):
> - sub $64, %rdx
> - movaps 0x0a(%rsi), %xmm2
> - movaps 0x1a(%rsi), %xmm3
> - movaps 0x2a(%rsi), %xmm4
> - movaps 0x3a(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $6, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $6, %xmm3, %xmm4
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $6, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_6_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6_bwd):
> - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x06(%rsi), %xmm1
> - jb L(L6_bwd)
> - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
> -L(L6_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_6_bwd_loop_L1):
> - movaps -0x16(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x26(%rsi), %xmm3
> - movaps -0x36(%rsi), %xmm4
> - movaps -0x46(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $6, %xmm2, %xmm1
> - palignr $6, %xmm3, %xmm2
> - palignr $6, %xmm4, %xmm3
> - palignr $6, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_6_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7):
> - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x07(%rsi), %xmm1
> - jb L(L7_fwd)
> - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
> -L(L7_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_7_loop_L1):
> - sub $64, %rdx
> - movaps 0x09(%rsi), %xmm2
> - movaps 0x19(%rsi), %xmm3
> - movaps 0x29(%rsi), %xmm4
> - movaps 0x39(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $7, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $7, %xmm3, %xmm4
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $7, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_7_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7_bwd):
> - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x07(%rsi), %xmm1
> - jb L(L7_bwd)
> - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
> -L(L7_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_7_bwd_loop_L1):
> - movaps -0x17(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x27(%rsi), %xmm3
> - movaps -0x37(%rsi), %xmm4
> - movaps -0x47(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $7, %xmm2, %xmm1
> - palignr $7, %xmm3, %xmm2
> - palignr $7, %xmm4, %xmm3
> - palignr $7, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_7_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8):
> - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x08(%rsi), %xmm1
> - jb L(L8_fwd)
> - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
> -L(L8_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> -L(shl_8_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_8_loop_L1):
> - sub $64, %rdx
> - movaps 0x08(%rsi), %xmm2
> - movaps 0x18(%rsi), %xmm3
> - movaps 0x28(%rsi), %xmm4
> - movaps 0x38(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $8, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $8, %xmm3, %xmm4
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $8, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_8_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> - .p2align 4
> -L(shl_8_end):
> - lea 64(%rdx), %rdx
> - movaps %xmm4, -0x20(%rdi)
> - add %rdx, %rsi
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8_bwd):
> - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x08(%rsi), %xmm1
> - jb L(L8_bwd)
> - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
> -L(L8_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_8_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_8_bwd_loop_L1):
> - movaps -0x18(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x28(%rsi), %xmm3
> - movaps -0x38(%rsi), %xmm4
> - movaps -0x48(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $8, %xmm2, %xmm1
> - palignr $8, %xmm3, %xmm2
> - palignr $8, %xmm4, %xmm3
> - palignr $8, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_8_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_8_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9):
> - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x09(%rsi), %xmm1
> - jb L(L9_fwd)
> - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
> -L(L9_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_9_loop_L1):
> - sub $64, %rdx
> - movaps 0x07(%rsi), %xmm2
> - movaps 0x17(%rsi), %xmm3
> - movaps 0x27(%rsi), %xmm4
> - movaps 0x37(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $9, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $9, %xmm3, %xmm4
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $9, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_9_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9_bwd):
> - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x09(%rsi), %xmm1
> - jb L(L9_bwd)
> - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
> -L(L9_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_9_bwd_loop_L1):
> - movaps -0x19(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x29(%rsi), %xmm3
> - movaps -0x39(%rsi), %xmm4
> - movaps -0x49(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $9, %xmm2, %xmm1
> - palignr $9, %xmm3, %xmm2
> - palignr $9, %xmm4, %xmm3
> - palignr $9, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_9_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10):
> - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - jb L(L10_fwd)
> - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
> -L(L10_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_10_loop_L1):
> - sub $64, %rdx
> - movaps 0x06(%rsi), %xmm2
> - movaps 0x16(%rsi), %xmm3
> - movaps 0x26(%rsi), %xmm4
> - movaps 0x36(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $10, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $10, %xmm3, %xmm4
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $10, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_10_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10_bwd):
> - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - jb L(L10_bwd)
> - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
> -L(L10_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_10_bwd_loop_L1):
> - movaps -0x1a(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2a(%rsi), %xmm3
> - movaps -0x3a(%rsi), %xmm4
> - movaps -0x4a(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $10, %xmm2, %xmm1
> - palignr $10, %xmm3, %xmm2
> - palignr $10, %xmm4, %xmm3
> - palignr $10, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_10_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11):
> - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - jb L(L11_fwd)
> - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
> -L(L11_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_11_loop_L1):
> - sub $64, %rdx
> - movaps 0x05(%rsi), %xmm2
> - movaps 0x15(%rsi), %xmm3
> - movaps 0x25(%rsi), %xmm4
> - movaps 0x35(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $11, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $11, %xmm3, %xmm4
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $11, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_11_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11_bwd):
> - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - jb L(L11_bwd)
> - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
> -L(L11_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_11_bwd_loop_L1):
> - movaps -0x1b(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2b(%rsi), %xmm3
> - movaps -0x3b(%rsi), %xmm4
> - movaps -0x4b(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $11, %xmm2, %xmm1
> - palignr $11, %xmm3, %xmm2
> - palignr $11, %xmm4, %xmm3
> - palignr $11, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_11_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12):
> - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0c(%rsi), %xmm1
> - jb L(L12_fwd)
> - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
> -L(L12_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_12_loop_L1):
> - sub $64, %rdx
> - movaps 0x04(%rsi), %xmm2
> - movaps 0x14(%rsi), %xmm3
> - movaps 0x24(%rsi), %xmm4
> - movaps 0x34(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $12, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $12, %xmm3, %xmm4
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $12, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_12_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12_bwd):
> - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0c(%rsi), %xmm1
> - jb L(L12_bwd)
> - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
> -L(L12_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_12_bwd_loop_L1):
> - movaps -0x1c(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2c(%rsi), %xmm3
> - movaps -0x3c(%rsi), %xmm4
> - movaps -0x4c(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $12, %xmm2, %xmm1
> - palignr $12, %xmm3, %xmm2
> - palignr $12, %xmm4, %xmm3
> - palignr $12, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_12_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13):
> - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - jb L(L13_fwd)
> - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
> -L(L13_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_13_loop_L1):
> - sub $64, %rdx
> - movaps 0x03(%rsi), %xmm2
> - movaps 0x13(%rsi), %xmm3
> - movaps 0x23(%rsi), %xmm4
> - movaps 0x33(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $13, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $13, %xmm3, %xmm4
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $13, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_13_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13_bwd):
> - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - jb L(L13_bwd)
> - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
> -L(L13_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_13_bwd_loop_L1):
> - movaps -0x1d(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2d(%rsi), %xmm3
> - movaps -0x3d(%rsi), %xmm4
> - movaps -0x4d(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $13, %xmm2, %xmm1
> - palignr $13, %xmm3, %xmm2
> - palignr $13, %xmm4, %xmm3
> - palignr $13, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_13_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14):
> - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - jb L(L14_fwd)
> - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
> -L(L14_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_14_loop_L1):
> - sub $64, %rdx
> - movaps 0x02(%rsi), %xmm2
> - movaps 0x12(%rsi), %xmm3
> - movaps 0x22(%rsi), %xmm4
> - movaps 0x32(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $14, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $14, %xmm3, %xmm4
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $14, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_14_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14_bwd):
> - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - jb L(L14_bwd)
> - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
> -L(L14_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_14_bwd_loop_L1):
> - movaps -0x1e(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2e(%rsi), %xmm3
> - movaps -0x3e(%rsi), %xmm4
> - movaps -0x4e(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $14, %xmm2, %xmm1
> - palignr $14, %xmm3, %xmm2
> - palignr $14, %xmm4, %xmm3
> - palignr $14, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_14_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15):
> - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - jb L(L15_fwd)
> - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
> -L(L15_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_15_loop_L1):
> - sub $64, %rdx
> - movaps 0x01(%rsi), %xmm2
> - movaps 0x11(%rsi), %xmm3
> - movaps 0x21(%rsi), %xmm4
> - movaps 0x31(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $15, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $15, %xmm3, %xmm4
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $15, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_15_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15_bwd):
> - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - jb L(L15_bwd)
> - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
> -L(L15_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_15_bwd_loop_L1):
> - movaps -0x1f(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2f(%rsi), %xmm3
> - movaps -0x3f(%rsi), %xmm4
> - movaps -0x4f(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $15, %xmm2, %xmm1
> - palignr $15, %xmm3, %xmm2
> - palignr $15, %xmm4, %xmm3
> - palignr $15, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_15_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(write_72bytes):
> - movdqu -72(%rsi), %xmm0
> - movdqu -56(%rsi), %xmm1
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rcx
> - movdqu %xmm0, -72(%rdi)
> - movdqu %xmm1, -56(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_64bytes):
> - movdqu -64(%rsi), %xmm0
> - mov -48(%rsi), %rcx
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -64(%rdi)
> - mov %rcx, -48(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_56bytes):
> - movdqu -56(%rsi), %xmm0
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rcx
> - movdqu %xmm0, -56(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_48bytes):
> - mov -48(%rsi), %rcx
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %rcx, -48(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_40bytes):
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_32bytes):
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_24bytes):
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_16bytes):
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_8bytes):
> - mov -8(%rsi), %rdx
> - mov %rdx, -8(%rdi)
> -L(write_0bytes):
> - ret
> -
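The L(write_Nbytes) entries in this region are the jump-table tails:
each copies the final N bytes of the region, addressing everything as
negative offsets from end pointers, and issues all of its loads before
its first store so the copy stays correct for overlapping memmove
inputs.  There is no loop; sizes that do not decompose exactly are
covered by letting chunks overlap.  A minimal C sketch of one
representative entry (memcpy through a local stands in for an unaligned
register load/store):

  #include <stdint.h>
  #include <string.h>

  /* Sketch only: the shape of L(write_21bytes).  All loads happen
     before any store; the last two 8-byte chunks overlap by 3 bytes.  */
  static void
  write_21 (unsigned char *dst_end, const unsigned char *src_end)
  {
    uint64_t a, b, c;
    memcpy (&a, src_end - 21, 8);  /* bytes -21 .. -14 */
    memcpy (&b, src_end - 13, 8);  /* bytes -13 .. -6  */
    memcpy (&c, src_end - 8, 8);   /* bytes  -8 .. -1  */
    memcpy (dst_end - 21, &a, 8);
    memcpy (dst_end - 13, &b, 8);
    memcpy (dst_end - 8, &c, 8);
  }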
> - .p2align 4
> -L(write_73bytes):
> - movdqu -73(%rsi), %xmm0
> - movdqu -57(%rsi), %xmm1
> - mov -41(%rsi), %rcx
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %r8
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -73(%rdi)
> - movdqu %xmm1, -57(%rdi)
> - mov %rcx, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %r8, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_65bytes):
> - movdqu -65(%rsi), %xmm0
> - movdqu -49(%rsi), %xmm1
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -65(%rdi)
> - movdqu %xmm1, -49(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_57bytes):
> - movdqu -57(%rsi), %xmm0
> - mov -41(%rsi), %r8
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -57(%rdi)
> - mov %r8, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_49bytes):
> - movdqu -49(%rsi), %xmm0
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -49(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_41bytes):
> - mov -41(%rsi), %r8
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r8, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_33bytes):
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_25bytes):
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_17bytes):
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_9bytes):
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_1bytes):
> - mov -1(%rsi), %dl
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_74bytes):
> - movdqu -74(%rsi), %xmm0
> - movdqu -58(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -74(%rdi)
> - movdqu %xmm1, -58(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_66bytes):
> - movdqu -66(%rsi), %xmm0
> - movdqu -50(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -66(%rdi)
> - movdqu %xmm1, -50(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_58bytes):
> - movdqu -58(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm1, -58(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_50bytes):
> - movdqu -50(%rsi), %xmm0
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -50(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_42bytes):
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_34bytes):
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_26bytes):
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_18bytes):
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_10bytes):
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_2bytes):
> - mov -2(%rsi), %dx
> - mov %dx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_75bytes):
> - movdqu -75(%rsi), %xmm0
> - movdqu -59(%rsi), %xmm1
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -75(%rdi)
> - movdqu %xmm1, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_67bytes):
> - movdqu -67(%rsi), %xmm0
> - movdqu -59(%rsi), %xmm1
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -67(%rdi)
> - movdqu %xmm1, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_59bytes):
> - movdqu -59(%rsi), %xmm0
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_51bytes):
> - movdqu -51(%rsi), %xmm0
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -51(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_43bytes):
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_35bytes):
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_27bytes):
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_19bytes):
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_11bytes):
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_3bytes):
> - mov -3(%rsi), %dx
> - mov -2(%rsi), %cx
> - mov %dx, -3(%rdi)
> - mov %cx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_76bytes):
> - movdqu -76(%rsi), %xmm0
> - movdqu -60(%rsi), %xmm1
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -76(%rdi)
> - movdqu %xmm1, -60(%rdi)
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_68bytes):
> - movdqu -68(%rsi), %xmm0
> - movdqu -52(%rsi), %xmm1
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -68(%rdi)
> - movdqu %xmm1, -52(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_60bytes):
> - movdqu -60(%rsi), %xmm0
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -60(%rdi)
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_52bytes):
> - movdqu -52(%rsi), %xmm0
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -52(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_44bytes):
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_36bytes):
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_28bytes):
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_20bytes):
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_12bytes):
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_4bytes):
> - mov -4(%rsi), %edx
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_77bytes):
> - movdqu -77(%rsi), %xmm0
> - movdqu -61(%rsi), %xmm1
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -77(%rdi)
> - movdqu %xmm1, -61(%rdi)
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_69bytes):
> - movdqu -69(%rsi), %xmm0
> - movdqu -53(%rsi), %xmm1
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -69(%rdi)
> - movdqu %xmm1, -53(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_61bytes):
> - movdqu -61(%rsi), %xmm0
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -61(%rdi)
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_53bytes):
> - movdqu -53(%rsi), %xmm0
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -53(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_45bytes):
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_37bytes):
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_29bytes):
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_21bytes):
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_13bytes):
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_5bytes):
> - mov -5(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -5(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_78bytes):
> - movdqu -78(%rsi), %xmm0
> - movdqu -62(%rsi), %xmm1
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -78(%rdi)
> - movdqu %xmm1, -62(%rdi)
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_70bytes):
> - movdqu -70(%rsi), %xmm0
> - movdqu -54(%rsi), %xmm1
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -70(%rdi)
> - movdqu %xmm1, -54(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_62bytes):
> - movdqu -62(%rsi), %xmm0
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -62(%rdi)
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_54bytes):
> - movdqu -54(%rsi), %xmm0
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -54(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_46bytes):
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_38bytes):
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_30bytes):
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_22bytes):
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_14bytes):
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_6bytes):
> - mov -6(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -6(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_79bytes):
> - movdqu -79(%rsi), %xmm0
> - movdqu -63(%rsi), %xmm1
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -79(%rdi)
> - movdqu %xmm1, -63(%rdi)
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_71bytes):
> - movdqu -71(%rsi), %xmm0
> - movdqu -55(%rsi), %xmm1
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -71(%rdi)
> - movdqu %xmm1, -55(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_63bytes):
> - movdqu -63(%rsi), %xmm0
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -63(%rdi)
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_55bytes):
> - movdqu -55(%rsi), %xmm0
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -55(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_47bytes):
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_39bytes):
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_31bytes):
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_23bytes):
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_15bytes):
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_7bytes):
> - mov -7(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -7(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(large_page_fwd):
> - movdqu (%rsi), %xmm1
> - lea 16(%rsi), %rsi
> - movdqu %xmm0, (%r8)
> - movntdq %xmm1, (%rdi)
> - lea 16(%rdi), %rdi
> - lea -0x90(%rdx), %rdx
> -#ifdef USE_AS_MEMMOVE
> - mov %rsi, %r9
> - sub %rdi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_fwd)
> - shl $2, %rcx
> - cmp %rcx, %rdx
> - jb L(ll_cache_copy_fwd_start)
> -L(memmove_is_memcpy_fwd):
> -#endif
> -L(large_page_loop):
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - movntdq %xmm4, 0x40(%rdi)
> - movntdq %xmm5, 0x50(%rdi)
> - movntdq %xmm6, 0x60(%rdi)
> - movntdq %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(large_page_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_less_64bytes)
> -
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - lea 0x40(%rsi), %rsi
> -
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - lea 0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - sfence
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
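L(large_page_fwd) above is the very-large-copy path (selected by a size
check earlier in the file): unaligned movdqu loads are paired with
movntdq streaming stores that bypass the cache, 128 bytes per
iteration, and an sfence before the jump-table branch makes the
weakly-ordered non-temporal stores globally visible first.  A minimal
C sketch of the core loop with SSE2 intrinsics, assuming dst is 16-byte
aligned and n is a multiple of 128:

  #include <emmintrin.h>  /* SSE2 */

  /* Sketch only: streaming copy as in L(large_page_loop).  */
  static void
  stream_copy (char *dst, const char *src, size_t n)
  {
    while (n)
      {
        for (int i = 0; i < 8; i++)
          _mm_stream_si128 ((__m128i *) (dst + 16 * i),
                            _mm_loadu_si128 ((const __m128i *) (src + 16 * i)));
        src += 128;
        dst += 128;
        n -= 128;
      }
    _mm_sfence ();   /* order the non-temporal stores */
  }

L(large_page_bwd) further down is the same loop walking downward.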
> -#ifdef USE_AS_MEMMOVE
> - .p2align 4
> -L(ll_cache_copy_fwd_start):
> - prefetcht0 0x1c0(%rsi)
> - prefetcht0 0x200(%rsi)
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movaps %xmm0, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> - movaps %xmm4, 0x40(%rdi)
> - movaps %xmm5, 0x50(%rdi)
> - movaps %xmm6, 0x60(%rdi)
> - movaps %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(ll_cache_copy_fwd_start)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_ll_less_fwd_64bytes)
> -
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - lea 0x40(%rsi), %rsi
> -
> - movaps %xmm0, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> - lea 0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_ll_less_fwd_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> -#endif
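Note how the USE_AS_MEMMOVE block wraps the streaming path in an
overlap check: if the source is at least the remaining length ahead of
the destination, the regions cannot interfere and the movntdq loop is
used as-is; if they do overlap and the copy is not several times larger
than the threshold carried in %rcx (hence the shl $2 before the
compare), the L(ll_cache_copy_fwd_start) loop is taken instead,
prefetching with prefetcht0 and storing with plain movaps so the data
stays in cache.  A sketch of the selection, reusing stream_copy from
the previous sketch (cached_copy and cache_threshold are illustrative
stand-ins, not glibc's names):

  #include <stddef.h>
  #include <string.h>

  void stream_copy (char *dst, const char *src, size_t n); /* as above */

  /* Illustrative stand-in for the prefetcht0/movaps loop.  */
  static void
  cached_copy (char *dst, const char *src, size_t n)
  {
    memmove (dst, src, n);
  }

  /* Sketch only: forward large-copy strategy selection.  */
  static void
  large_copy_fwd (char *dst, const char *src, size_t n,
                  size_t cache_threshold)
  {
    if ((size_t) (src - dst) >= n       /* no overlap */
        || n >= 4 * cache_threshold)    /* the shl $2, %rcx test */
      stream_copy (dst, src, n);
    else
      cached_copy (dst, src, n);
  }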
> - .p2align 4
> -L(large_page_bwd):
> - movdqu -0x10(%rsi), %xmm1
> - lea -16(%rsi), %rsi
> - movdqu %xmm0, (%r8)
> - movdqa %xmm1, -0x10(%rdi)
> - lea -16(%rdi), %rdi
> - lea -0x90(%rdx), %rdx
> -#ifdef USE_AS_MEMMOVE
> - mov %rdi, %r9
> - sub %rsi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_bwd)
> - cmp %rcx, %r9
> - jb L(ll_cache_copy_bwd_start)
> -L(memmove_is_memcpy_bwd):
> -#endif
> -L(large_page_bwd_loop):
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - movdqu -0x50(%rsi), %xmm4
> - movdqu -0x60(%rsi), %xmm5
> - movdqu -0x70(%rsi), %xmm6
> - movdqu -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movntdq %xmm0, -0x10(%rdi)
> - movntdq %xmm1, -0x20(%rdi)
> - movntdq %xmm2, -0x30(%rdi)
> - movntdq %xmm3, -0x40(%rdi)
> - movntdq %xmm4, -0x50(%rdi)
> - movntdq %xmm5, -0x60(%rdi)
> - movntdq %xmm6, -0x70(%rdi)
> - movntdq %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> - jae L(large_page_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_less_bwd_64bytes)
> -
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - lea -0x40(%rsi), %rsi
> -
> - movntdq %xmm0, -0x10(%rdi)
> - movntdq %xmm1, -0x20(%rdi)
> - movntdq %xmm2, -0x30(%rdi)
> - movntdq %xmm3, -0x40(%rdi)
> - lea -0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_less_bwd_64bytes):
> - sfence
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> -#ifdef USE_AS_MEMMOVE
> - .p2align 4
> -L(ll_cache_copy_bwd_start):
> - prefetcht0 -0x1c0(%rsi)
> - prefetcht0 -0x200(%rsi)
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - movdqu -0x50(%rsi), %xmm4
> - movdqu -0x60(%rsi), %xmm5
> - movdqu -0x70(%rsi), %xmm6
> - movdqu -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - movaps %xmm4, -0x50(%rdi)
> - movaps %xmm5, -0x60(%rdi)
> - movaps %xmm6, -0x70(%rdi)
> - movaps %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> - jae L(ll_cache_copy_bwd_start)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_ll_less_bwd_64bytes)
> -
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - lea -0x40(%rsi), %rsi
> -
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - lea -0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_ll_less_bwd_64bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -#endif
> -
> -END (MEMCPY)
> -
> - .section .rodata.ssse3,"a",@progbits
> - .p2align 3
> -L(table_less_80bytes):
> - .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
> -
> - .p2align 3
> -L(shl_table):
> - .int JMPTBL (L(shl_0), L(shl_table))
> - .int JMPTBL (L(shl_1), L(shl_table))
> - .int JMPTBL (L(shl_2), L(shl_table))
> - .int JMPTBL (L(shl_3), L(shl_table))
> - .int JMPTBL (L(shl_4), L(shl_table))
> - .int JMPTBL (L(shl_5), L(shl_table))
> - .int JMPTBL (L(shl_6), L(shl_table))
> - .int JMPTBL (L(shl_7), L(shl_table))
> - .int JMPTBL (L(shl_8), L(shl_table))
> - .int JMPTBL (L(shl_9), L(shl_table))
> - .int JMPTBL (L(shl_10), L(shl_table))
> - .int JMPTBL (L(shl_11), L(shl_table))
> - .int JMPTBL (L(shl_12), L(shl_table))
> - .int JMPTBL (L(shl_13), L(shl_table))
> - .int JMPTBL (L(shl_14), L(shl_table))
> - .int JMPTBL (L(shl_15), L(shl_table))
> -
> - .p2align 3
> -L(shl_table_bwd):
> - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
> deleted file mode 100644
> index 295430b1ef..0000000000
> --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_MEMMOVE
> -#define MEMCPY __memmove_ssse3
> -#define MEMCPY_CHK __memmove_chk_ssse3
> -#include "memcpy-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
^ permalink raw reply [flat|nested] 49+ messages in thread
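
Both deleted files dispatch their short-copy tails through the jump tables
quoted above (L(table_less_80bytes) and friends) and the
BRANCH_TO_JMPTBL_ENTRY macro defined in the diff below: each .int entry,
built with JMPTBL (I, B) = I - B, stores a target's offset from the table
base, so the tables need no load-time relocations. As a rough illustration
of that technique, here is a minimal GNU C sketch using gcc's
labels-as-values extension; it is not code from the patch, and the labels
and sizes are made up:

#include <stdio.h>

/* Copy the last n (0..2) bytes, dispatching through a table of
   label offsets relative to a base label -- the same shape as
   JMPTBL (I, B) = I - B in the removed assembly.  */
static void
copy_tail (char *dst, const char *src, unsigned n)
{
  static const int table[] = {
    &&bytes_0 - &&bytes_0,
    &&bytes_1 - &&bytes_0,
    &&bytes_2 - &&bytes_0,
  };
  /* The lea/movslq/indirect-jmp sequence of BRANCH_TO_JMPTBL_ENTRY.  */
  goto *(&&bytes_0 + table[n]);

 bytes_2:
  dst[1] = src[1];   /* falls through, like the L(fwd_write_*bytes) chains */
 bytes_1:
  dst[0] = src[0];
 bytes_0:
  return;
}

int
main (void)
{
  char out[3] = "xx";
  copy_tail (out, "ab", 2);
  puts (out);   /* prints "ab" */
  return 0;
}

The ud2 after the indirect jmp in the macro serves to stop the front end
from decoding and speculating straight-line past the dispatch.
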
* Re: [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-03-25 19:56 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:56 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
> sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 -
> sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
> sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
> 5 files changed, 3209 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 48f81711ae..323be3b969 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,14 +16,12 @@ sysdep_routines += \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> memcmpeq-sse2 \
> - memcpy-ssse3-back \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> memmove-avx512-unaligned-erms \
> memmove-evex-unaligned-erms \
> memmove-sse2-unaligned-erms \
> - memmove-ssse3-back \
> memrchr-avx2 \
> memrchr-avx2-rtm \
> memrchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 70b0e9c62e..d6852ab365 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memmove_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memmove_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
> __memmove_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
> @@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_avx512_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> - __memmove_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
> IFUNC_IMPL_ADD (array, i, memmove, 1,
> __memmove_sse2_unaligned)
> @@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
> __memcpy_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
> @@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memcpy,
> CPU_FEATURE_USABLE (AVX512F),
> __memcpy_avx512_no_vzeroupper)
> @@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
> __mempcpy_chk_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
> @@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, mempcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> __mempcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> index 1ecdd4b0d3..5596ddea2c 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> @@ -92,13 +92,6 @@ IFUNC_SELECTOR (void)
> }
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> - && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> - {
> - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
> - return OPTIMIZE (ssse3_back);
> - }
> -
> if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> return OPTIMIZE (sse2_unaligned_erms);
>
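
With the SSSE3/Fast_Copy_Backward tier gone, the selector quoted above drops
straight from the vector-extension checks to the ERMS and plain SSE2
variants. A self-contained sketch of that fallthrough shape in plain C --
stub feature flags and stand-in copy functions here, where real glibc uses
CPU_FEATURE_USABLE_P and OPTIMIZE over the cpu_features block, and the
resolver runs once at relocation time rather than on every call:

#include <stdio.h>
#include <string.h>

typedef void *(*memmove_fn) (void *, const void *, size_t);

/* Stand-ins for the real implementations.  */
static void *mm_avx (void *d, const void *s, size_t n)  { return memmove (d, s, n); }
static void *mm_erms (void *d, const void *s, size_t n) { return memmove (d, s, n); }
static void *mm_sse2 (void *d, const void *s, size_t n) { return memmove (d, s, n); }

/* Hypothetical feature flags; a real selector reads CPUID.  */
static int has_avx, has_erms;

static memmove_fn
select_memmove (void)
{
  if (has_avx)
    return mm_avx;
  /* The SSSE3 tier removed by this patch sat between these checks.  */
  if (has_erms)
    return mm_erms;
  return mm_sse2;
}

int
main (void)
{
  char buf[8] = "abcdefg";
  select_memmove () (buf + 1, buf, 3);   /* overlapping move */
  puts (buf);   /* prints "aabcefg" */
  return 0;
}
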
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> deleted file mode 100644
> index 92cfbf7933..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> +++ /dev/null
> @@ -1,3181 +0,0 @@
> -/* memcpy with SSSE3 and REP string
> - Copyright (C) 2010-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -
> -#include "asm-syntax.h"
> -
> -#ifndef MEMCPY
> -# define MEMCPY __memcpy_ssse3_back
> -# define MEMCPY_CHK __memcpy_chk_ssse3_back
> -# define MEMPCPY __mempcpy_ssse3_back
> -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
> -#endif
> -
> -#define JMPTBL(I, B) I - B
> -
> -/* Branch to an entry in a jump table. TABLE is a jump table with
> - relative offsets. INDEX is a register containing the index into the
> - jump table. SCALE is the scale of INDEX. */
> -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), INDEX; \
> - lea (%r11, INDEX), INDEX; \
> - _CET_NOTRACK jmp *INDEX; \
> - ud2
> -
> - .section .text.ssse3,"ax",@progbits
> -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
> -ENTRY (MEMPCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMPCPY_CHK)
> -
> -ENTRY (MEMPCPY)
> - mov %RDI_LP, %RAX_LP
> - add %RDX_LP, %RAX_LP
> - jmp L(start)
> -END (MEMPCPY)
> -#endif
> -
> -#if !defined USE_AS_BCOPY
> -ENTRY (MEMCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMCPY_CHK)
> -#endif
> -
> -ENTRY (MEMCPY)
> - mov %RDI_LP, %RAX_LP
> -#ifdef USE_AS_MEMPCPY
> - add %RDX_LP, %RAX_LP
> -#endif
> -
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -#endif
> -
> -#ifdef USE_AS_MEMMOVE
> - cmp %rsi, %rdi
> - jb L(copy_forward)
> - je L(bwd_write_0bytes)
> - cmp $144, %rdx
> - jae L(copy_backward)
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -L(copy_forward):
> -#endif
> -L(start):
> - cmp $144, %rdx
> - jae L(144bytesormore)
> -
> -L(fwd_write_less32bytes):
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jbe L(bk_write)
> -#endif
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -#ifndef USE_AS_MEMMOVE
> -L(bk_write):
> -
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -#endif
> -
> - .p2align 4
> -L(144bytesormore):
> -
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jle L(copy_backward)
> -#endif
> - movdqu (%rsi), %xmm0
> - mov %rdi, %r8
> - and $-16, %rdi
> - add $16, %rdi
> - mov %rdi, %r9
> - sub %r8, %r9
> - sub %r9, %rdx
> - add %r9, %rsi
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0)
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - cmp %rcx, %rdx
> - jae L(gobble_mem_fwd)
> - lea L(shl_table_fwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(copy_backward):
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - shl $1, %rcx
> - cmp %rcx, %rdx
> - ja L(gobble_mem_bwd)
> -
> - add %rdx, %rdi
> - add %rdx, %rsi
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $0xf, %r9
> - xor %r9, %rdi
> - sub %r9, %rsi
> - sub %r9, %rdx
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0_bwd)
> - lea L(shl_table_bwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(shl_0):
> -
> - mov %rdx, %r9
> - shr $8, %r9
> - add %rdx, %r9
> -#ifdef DATA_CACHE_SIZE
> - cmp $DATA_CACHE_SIZE_HALF, %R9_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %R9_LP
> -#endif
> - jae L(gobble_mem_fwd)
> - sub $0x80, %rdx
> - .p2align 4
> -L(shl_0_loop):
> - movdqa (%rsi), %xmm1
> - movdqa %xmm1, (%rdi)
> - movaps 0x10(%rsi), %xmm2
> - movaps %xmm2, 0x10(%rdi)
> - movaps 0x20(%rsi), %xmm3
> - movaps %xmm3, 0x20(%rdi)
> - movaps 0x30(%rsi), %xmm4
> - movaps %xmm4, 0x30(%rdi)
> - movaps 0x40(%rsi), %xmm1
> - movaps %xmm1, 0x40(%rdi)
> - movaps 0x50(%rsi), %xmm2
> - movaps %xmm2, 0x50(%rdi)
> - movaps 0x60(%rsi), %xmm3
> - movaps %xmm3, 0x60(%rdi)
> - movaps 0x70(%rsi), %xmm4
> - movaps %xmm4, 0x70(%rdi)
> - sub $0x80, %rdx
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(shl_0_loop)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_bwd):
> - sub $0x80, %rdx
> -L(copy_backward_loop):
> - movaps -0x10(%rsi), %xmm1
> - movaps %xmm1, -0x10(%rdi)
> - movaps -0x20(%rsi), %xmm2
> - movaps %xmm2, -0x20(%rdi)
> - movaps -0x30(%rsi), %xmm3
> - movaps %xmm3, -0x30(%rdi)
> - movaps -0x40(%rsi), %xmm4
> - movaps %xmm4, -0x40(%rdi)
> - movaps -0x50(%rsi), %xmm5
> - movaps %xmm5, -0x50(%rdi)
> - movaps -0x60(%rsi), %xmm5
> - movaps %xmm5, -0x60(%rdi)
> - movaps -0x70(%rsi), %xmm5
> - movaps %xmm5, -0x70(%rdi)
> - movaps -0x80(%rsi), %xmm5
> - movaps %xmm5, -0x80(%rdi)
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(copy_backward_loop)
> -
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1):
> - sub $0x80, %rdx
> - movaps -0x01(%rsi), %xmm1
> - movaps 0x0f(%rsi), %xmm2
> - movaps 0x1f(%rsi), %xmm3
> - movaps 0x2f(%rsi), %xmm4
> - movaps 0x3f(%rsi), %xmm5
> - movaps 0x4f(%rsi), %xmm6
> - movaps 0x5f(%rsi), %xmm7
> - movaps 0x6f(%rsi), %xmm8
> - movaps 0x7f(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $1, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $1, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $1, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $1, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $1, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $1, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $1, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_1)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1_bwd):
> - movaps -0x01(%rsi), %xmm1
> -
> - movaps -0x11(%rsi), %xmm2
> - palignr $1, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x21(%rsi), %xmm3
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x31(%rsi), %xmm4
> - palignr $1, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x41(%rsi), %xmm5
> - palignr $1, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x51(%rsi), %xmm6
> - palignr $1, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x61(%rsi), %xmm7
> - palignr $1, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x71(%rsi), %xmm8
> - palignr $1, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x81(%rsi), %xmm9
> - palignr $1, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_1_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2):
> - sub $0x80, %rdx
> - movaps -0x02(%rsi), %xmm1
> - movaps 0x0e(%rsi), %xmm2
> - movaps 0x1e(%rsi), %xmm3
> - movaps 0x2e(%rsi), %xmm4
> - movaps 0x3e(%rsi), %xmm5
> - movaps 0x4e(%rsi), %xmm6
> - movaps 0x5e(%rsi), %xmm7
> - movaps 0x6e(%rsi), %xmm8
> - movaps 0x7e(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $2, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $2, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $2, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $2, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $2, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $2, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $2, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_2)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2_bwd):
> - movaps -0x02(%rsi), %xmm1
> -
> - movaps -0x12(%rsi), %xmm2
> - palignr $2, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x22(%rsi), %xmm3
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x32(%rsi), %xmm4
> - palignr $2, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x42(%rsi), %xmm5
> - palignr $2, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x52(%rsi), %xmm6
> - palignr $2, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x62(%rsi), %xmm7
> - palignr $2, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x72(%rsi), %xmm8
> - palignr $2, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x82(%rsi), %xmm9
> - palignr $2, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_2_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3):
> - sub $0x80, %rdx
> - movaps -0x03(%rsi), %xmm1
> - movaps 0x0d(%rsi), %xmm2
> - movaps 0x1d(%rsi), %xmm3
> - movaps 0x2d(%rsi), %xmm4
> - movaps 0x3d(%rsi), %xmm5
> - movaps 0x4d(%rsi), %xmm6
> - movaps 0x5d(%rsi), %xmm7
> - movaps 0x6d(%rsi), %xmm8
> - movaps 0x7d(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $3, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $3, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $3, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $3, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $3, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $3, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $3, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_3)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3_bwd):
> - movaps -0x03(%rsi), %xmm1
> -
> - movaps -0x13(%rsi), %xmm2
> - palignr $3, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x23(%rsi), %xmm3
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x33(%rsi), %xmm4
> - palignr $3, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x43(%rsi), %xmm5
> - palignr $3, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x53(%rsi), %xmm6
> - palignr $3, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x63(%rsi), %xmm7
> - palignr $3, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x73(%rsi), %xmm8
> - palignr $3, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x83(%rsi), %xmm9
> - palignr $3, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_3_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4):
> - sub $0x80, %rdx
> - movaps -0x04(%rsi), %xmm1
> - movaps 0x0c(%rsi), %xmm2
> - movaps 0x1c(%rsi), %xmm3
> - movaps 0x2c(%rsi), %xmm4
> - movaps 0x3c(%rsi), %xmm5
> - movaps 0x4c(%rsi), %xmm6
> - movaps 0x5c(%rsi), %xmm7
> - movaps 0x6c(%rsi), %xmm8
> - movaps 0x7c(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $4, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $4, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $4, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $4, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $4, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $4, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $4, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_4)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4_bwd):
> - movaps -0x04(%rsi), %xmm1
> -
> - movaps -0x14(%rsi), %xmm2
> - palignr $4, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x24(%rsi), %xmm3
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x34(%rsi), %xmm4
> - palignr $4, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x44(%rsi), %xmm5
> - palignr $4, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x54(%rsi), %xmm6
> - palignr $4, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x64(%rsi), %xmm7
> - palignr $4, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x74(%rsi), %xmm8
> - palignr $4, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x84(%rsi), %xmm9
> - palignr $4, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_4_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5):
> - sub $0x80, %rdx
> - movaps -0x05(%rsi), %xmm1
> - movaps 0x0b(%rsi), %xmm2
> - movaps 0x1b(%rsi), %xmm3
> - movaps 0x2b(%rsi), %xmm4
> - movaps 0x3b(%rsi), %xmm5
> - movaps 0x4b(%rsi), %xmm6
> - movaps 0x5b(%rsi), %xmm7
> - movaps 0x6b(%rsi), %xmm8
> - movaps 0x7b(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $5, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $5, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $5, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $5, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $5, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $5, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $5, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_5)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5_bwd):
> - movaps -0x05(%rsi), %xmm1
> -
> - movaps -0x15(%rsi), %xmm2
> - palignr $5, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x25(%rsi), %xmm3
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x35(%rsi), %xmm4
> - palignr $5, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x45(%rsi), %xmm5
> - palignr $5, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x55(%rsi), %xmm6
> - palignr $5, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x65(%rsi), %xmm7
> - palignr $5, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x75(%rsi), %xmm8
> - palignr $5, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x85(%rsi), %xmm9
> - palignr $5, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_5_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6):
> - sub $0x80, %rdx
> - movaps -0x06(%rsi), %xmm1
> - movaps 0x0a(%rsi), %xmm2
> - movaps 0x1a(%rsi), %xmm3
> - movaps 0x2a(%rsi), %xmm4
> - movaps 0x3a(%rsi), %xmm5
> - movaps 0x4a(%rsi), %xmm6
> - movaps 0x5a(%rsi), %xmm7
> - movaps 0x6a(%rsi), %xmm8
> - movaps 0x7a(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $6, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $6, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $6, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $6, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $6, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $6, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $6, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_6)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6_bwd):
> - movaps -0x06(%rsi), %xmm1
> -
> - movaps -0x16(%rsi), %xmm2
> - palignr $6, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x26(%rsi), %xmm3
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x36(%rsi), %xmm4
> - palignr $6, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x46(%rsi), %xmm5
> - palignr $6, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x56(%rsi), %xmm6
> - palignr $6, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x66(%rsi), %xmm7
> - palignr $6, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x76(%rsi), %xmm8
> - palignr $6, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x86(%rsi), %xmm9
> - palignr $6, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_6_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7):
> - sub $0x80, %rdx
> - movaps -0x07(%rsi), %xmm1
> - movaps 0x09(%rsi), %xmm2
> - movaps 0x19(%rsi), %xmm3
> - movaps 0x29(%rsi), %xmm4
> - movaps 0x39(%rsi), %xmm5
> - movaps 0x49(%rsi), %xmm6
> - movaps 0x59(%rsi), %xmm7
> - movaps 0x69(%rsi), %xmm8
> - movaps 0x79(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $7, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $7, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $7, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $7, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $7, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $7, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $7, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_7)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7_bwd):
> - movaps -0x07(%rsi), %xmm1
> -
> - movaps -0x17(%rsi), %xmm2
> - palignr $7, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x27(%rsi), %xmm3
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x37(%rsi), %xmm4
> - palignr $7, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x47(%rsi), %xmm5
> - palignr $7, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x57(%rsi), %xmm6
> - palignr $7, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x67(%rsi), %xmm7
> - palignr $7, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x77(%rsi), %xmm8
> - palignr $7, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x87(%rsi), %xmm9
> - palignr $7, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_7_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8):
> - sub $0x80, %rdx
> - movaps -0x08(%rsi), %xmm1
> - movaps 0x08(%rsi), %xmm2
> - movaps 0x18(%rsi), %xmm3
> - movaps 0x28(%rsi), %xmm4
> - movaps 0x38(%rsi), %xmm5
> - movaps 0x48(%rsi), %xmm6
> - movaps 0x58(%rsi), %xmm7
> - movaps 0x68(%rsi), %xmm8
> - movaps 0x78(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $8, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $8, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $8, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $8, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $8, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $8, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $8, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_8)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8_bwd):
> - movaps -0x08(%rsi), %xmm1
> -
> - movaps -0x18(%rsi), %xmm2
> - palignr $8, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x28(%rsi), %xmm3
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x38(%rsi), %xmm4
> - palignr $8, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x48(%rsi), %xmm5
> - palignr $8, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x58(%rsi), %xmm6
> - palignr $8, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x68(%rsi), %xmm7
> - palignr $8, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x78(%rsi), %xmm8
> - palignr $8, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x88(%rsi), %xmm9
> - palignr $8, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_8_bwd)
> -L(shl_8_end_bwd):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9):
> - sub $0x80, %rdx
> - movaps -0x09(%rsi), %xmm1
> - movaps 0x07(%rsi), %xmm2
> - movaps 0x17(%rsi), %xmm3
> - movaps 0x27(%rsi), %xmm4
> - movaps 0x37(%rsi), %xmm5
> - movaps 0x47(%rsi), %xmm6
> - movaps 0x57(%rsi), %xmm7
> - movaps 0x67(%rsi), %xmm8
> - movaps 0x77(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $9, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $9, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $9, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $9, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $9, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $9, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $9, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_9)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9_bwd):
> - movaps -0x09(%rsi), %xmm1
> -
> - movaps -0x19(%rsi), %xmm2
> - palignr $9, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x29(%rsi), %xmm3
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x39(%rsi), %xmm4
> - palignr $9, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x49(%rsi), %xmm5
> - palignr $9, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x59(%rsi), %xmm6
> - palignr $9, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x69(%rsi), %xmm7
> - palignr $9, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x79(%rsi), %xmm8
> - palignr $9, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x89(%rsi), %xmm9
> - palignr $9, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_9_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10):
> - sub $0x80, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - movaps 0x06(%rsi), %xmm2
> - movaps 0x16(%rsi), %xmm3
> - movaps 0x26(%rsi), %xmm4
> - movaps 0x36(%rsi), %xmm5
> - movaps 0x46(%rsi), %xmm6
> - movaps 0x56(%rsi), %xmm7
> - movaps 0x66(%rsi), %xmm8
> - movaps 0x76(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $10, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $10, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $10, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $10, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $10, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $10, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $10, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_10)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10_bwd):
> - movaps -0x0a(%rsi), %xmm1
> -
> - movaps -0x1a(%rsi), %xmm2
> - palignr $10, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2a(%rsi), %xmm3
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3a(%rsi), %xmm4
> - palignr $10, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4a(%rsi), %xmm5
> - palignr $10, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5a(%rsi), %xmm6
> - palignr $10, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6a(%rsi), %xmm7
> - palignr $10, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7a(%rsi), %xmm8
> - palignr $10, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8a(%rsi), %xmm9
> - palignr $10, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_10_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11):
> - sub $0x80, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - movaps 0x05(%rsi), %xmm2
> - movaps 0x15(%rsi), %xmm3
> - movaps 0x25(%rsi), %xmm4
> - movaps 0x35(%rsi), %xmm5
> - movaps 0x45(%rsi), %xmm6
> - movaps 0x55(%rsi), %xmm7
> - movaps 0x65(%rsi), %xmm8
> - movaps 0x75(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $11, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $11, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $11, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $11, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $11, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $11, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $11, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_11)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11_bwd):
> - movaps -0x0b(%rsi), %xmm1
> -
> - movaps -0x1b(%rsi), %xmm2
> - palignr $11, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2b(%rsi), %xmm3
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3b(%rsi), %xmm4
> - palignr $11, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4b(%rsi), %xmm5
> - palignr $11, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5b(%rsi), %xmm6
> - palignr $11, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6b(%rsi), %xmm7
> - palignr $11, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7b(%rsi), %xmm8
> - palignr $11, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8b(%rsi), %xmm9
> - palignr $11, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_11_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12):
> - sub $0x80, %rdx
> - movdqa -0x0c(%rsi), %xmm1
> - movaps 0x04(%rsi), %xmm2
> - movaps 0x14(%rsi), %xmm3
> - movaps 0x24(%rsi), %xmm4
> - movaps 0x34(%rsi), %xmm5
> - movaps 0x44(%rsi), %xmm6
> - movaps 0x54(%rsi), %xmm7
> - movaps 0x64(%rsi), %xmm8
> - movaps 0x74(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $12, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $12, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $12, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $12, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $12, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $12, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $12, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> -
> - lea 0x80(%rdi), %rdi
> - jae L(shl_12)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12_bwd):
> - movaps -0x0c(%rsi), %xmm1
> -
> - movaps -0x1c(%rsi), %xmm2
> - palignr $12, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2c(%rsi), %xmm3
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3c(%rsi), %xmm4
> - palignr $12, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4c(%rsi), %xmm5
> - palignr $12, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5c(%rsi), %xmm6
> - palignr $12, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6c(%rsi), %xmm7
> - palignr $12, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7c(%rsi), %xmm8
> - palignr $12, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8c(%rsi), %xmm9
> - palignr $12, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_12_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13):
> - sub $0x80, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - movaps 0x03(%rsi), %xmm2
> - movaps 0x13(%rsi), %xmm3
> - movaps 0x23(%rsi), %xmm4
> - movaps 0x33(%rsi), %xmm5
> - movaps 0x43(%rsi), %xmm6
> - movaps 0x53(%rsi), %xmm7
> - movaps 0x63(%rsi), %xmm8
> - movaps 0x73(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $13, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $13, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $13, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $13, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $13, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $13, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $13, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_13)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13_bwd):
> - movaps -0x0d(%rsi), %xmm1
> -
> - movaps -0x1d(%rsi), %xmm2
> - palignr $13, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2d(%rsi), %xmm3
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3d(%rsi), %xmm4
> - palignr $13, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4d(%rsi), %xmm5
> - palignr $13, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5d(%rsi), %xmm6
> - palignr $13, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6d(%rsi), %xmm7
> - palignr $13, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7d(%rsi), %xmm8
> - palignr $13, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8d(%rsi), %xmm9
> - palignr $13, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_13_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14):
> - sub $0x80, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - movaps 0x02(%rsi), %xmm2
> - movaps 0x12(%rsi), %xmm3
> - movaps 0x22(%rsi), %xmm4
> - movaps 0x32(%rsi), %xmm5
> - movaps 0x42(%rsi), %xmm6
> - movaps 0x52(%rsi), %xmm7
> - movaps 0x62(%rsi), %xmm8
> - movaps 0x72(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $14, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $14, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $14, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $14, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $14, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $14, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $14, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_14)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14_bwd):
> - movaps -0x0e(%rsi), %xmm1
> -
> - movaps -0x1e(%rsi), %xmm2
> - palignr $14, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2e(%rsi), %xmm3
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3e(%rsi), %xmm4
> - palignr $14, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4e(%rsi), %xmm5
> - palignr $14, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5e(%rsi), %xmm6
> - palignr $14, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6e(%rsi), %xmm7
> - palignr $14, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7e(%rsi), %xmm8
> - palignr $14, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8e(%rsi), %xmm9
> - palignr $14, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_14_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15):
> - sub $0x80, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - movaps 0x01(%rsi), %xmm2
> - movaps 0x11(%rsi), %xmm3
> - movaps 0x21(%rsi), %xmm4
> - movaps 0x31(%rsi), %xmm5
> - movaps 0x41(%rsi), %xmm6
> - movaps 0x51(%rsi), %xmm7
> - movaps 0x61(%rsi), %xmm8
> - movaps 0x71(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $15, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $15, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $15, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $15, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $15, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $15, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $15, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_15)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15_bwd):
> - movaps -0x0f(%rsi), %xmm1
> -
> - movaps -0x1f(%rsi), %xmm2
> - palignr $15, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2f(%rsi), %xmm3
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3f(%rsi), %xmm4
> - palignr $15, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4f(%rsi), %xmm5
> - palignr $15, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5f(%rsi), %xmm6
> - palignr $15, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6f(%rsi), %xmm7
> - palignr $15, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7f(%rsi), %xmm8
> - palignr $15, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8f(%rsi), %xmm9
> - palignr $15, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_15_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_fwd):
> - movdqu (%rsi), %xmm1
> - movdqu %xmm0, (%r8)
> - movdqa %xmm1, (%rdi)
> - sub $16, %rdx
> - add $16, %rsi
> - add $16, %rdi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rsi, %r9
> - sub %rdi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_fwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_fwd_start)
> -L(memmove_is_memcpy_fwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger_in_fwd)
> - mov %rdx, %rcx
> -L(bigger_in_fwd):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy_fwd)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy_fwd)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy_fwd):
> - sub $0x80, %rdx
> -L(gobble_mem_fwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 0x200(%rsi)
> - prefetcht0 0x300(%rsi)
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lfence
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - movntdq %xmm4, 0x40(%rdi)
> - movntdq %xmm5, 0x50(%rdi)
> - movntdq %xmm6, 0x60(%rdi)
> - movntdq %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_mem_fwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_fwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy_fwd):
> - add %rcx, %rdx
> -L(ll_cache_copy_fwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop_fwd):
> - prefetchnta 0x1c0(%rsi)
> - prefetchnta 0x280(%rsi)
> - prefetchnta 0x1c0(%rdi)
> - prefetchnta 0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - movdqa %xmm2, 0x20(%rdi)
> - movdqa %xmm3, 0x30(%rdi)
> - movdqa %xmm4, 0x40(%rdi)
> - movdqa %xmm5, 0x50(%rdi)
> - movdqa %xmm6, 0x60(%rdi)
> - movdqa %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_ll_loop_fwd)
> -L(gobble_mem_fwd_end):
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_bwd):
> - add %rdx, %rsi
> - add %rdx, %rdi
> -
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $-16, %rdi
> - sub %rdi, %r9
> - sub %r9, %rsi
> - sub %r9, %rdx
> -
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rdi, %r9
> - sub %rsi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_bwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_bwd_start)
> -L(memmove_is_memcpy_bwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger)
> - mov %rdx, %rcx
> -L(bigger):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy):
> - sub $0x80, %rdx
> -L(gobble_mem_bwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 -0x200(%rsi)
> - prefetcht0 -0x300(%rsi)
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - lfence
> - movntdq %xmm1, -0x10(%rdi)
> - movntdq %xmm2, -0x20(%rdi)
> - movntdq %xmm3, -0x30(%rdi)
> - movntdq %xmm4, -0x40(%rdi)
> - movntdq %xmm5, -0x50(%rdi)
> - movntdq %xmm6, -0x60(%rdi)
> - movntdq %xmm7, -0x70(%rdi)
> - movntdq %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_mem_bwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_bwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy):
> - add %rcx, %rdx
> -L(ll_cache_copy_bwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop):
> - prefetchnta -0x1c0(%rsi)
> - prefetchnta -0x280(%rsi)
> - prefetchnta -0x1c0(%rdi)
> - prefetchnta -0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - movdqa %xmm1, -0x10(%rdi)
> - movdqa %xmm2, -0x20(%rdi)
> - movdqa %xmm3, -0x30(%rdi)
> - movdqa %xmm4, -0x40(%rdi)
> - movdqa %xmm5, -0x50(%rdi)
> - movdqa %xmm6, -0x60(%rdi)
> - movdqa %xmm7, -0x70(%rdi)
> - movdqa %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_ll_loop)
> -L(gobble_mem_bwd_end):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rsi
> - sub %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(fwd_write_128bytes):
> - lddqu -128(%rsi), %xmm0
> - movdqu %xmm0, -128(%rdi)
> -L(fwd_write_112bytes):
> - lddqu -112(%rsi), %xmm0
> - movdqu %xmm0, -112(%rdi)
> -L(fwd_write_96bytes):
> - lddqu -96(%rsi), %xmm0
> - movdqu %xmm0, -96(%rdi)
> -L(fwd_write_80bytes):
> - lddqu -80(%rsi), %xmm0
> - movdqu %xmm0, -80(%rdi)
> -L(fwd_write_64bytes):
> - lddqu -64(%rsi), %xmm0
> - movdqu %xmm0, -64(%rdi)
> -L(fwd_write_48bytes):
> - lddqu -48(%rsi), %xmm0
> - movdqu %xmm0, -48(%rdi)
> -L(fwd_write_32bytes):
> - lddqu -32(%rsi), %xmm0
> - movdqu %xmm0, -32(%rdi)
> -L(fwd_write_16bytes):
> - lddqu -16(%rsi), %xmm0
> - movdqu %xmm0, -16(%rdi)
> -L(fwd_write_0bytes):
> - ret
> -
> -
> - .p2align 4
> -L(fwd_write_143bytes):
> - lddqu -143(%rsi), %xmm0
> - movdqu %xmm0, -143(%rdi)
> -L(fwd_write_127bytes):
> - lddqu -127(%rsi), %xmm0
> - movdqu %xmm0, -127(%rdi)
> -L(fwd_write_111bytes):
> - lddqu -111(%rsi), %xmm0
> - movdqu %xmm0, -111(%rdi)
> -L(fwd_write_95bytes):
> - lddqu -95(%rsi), %xmm0
> - movdqu %xmm0, -95(%rdi)
> -L(fwd_write_79bytes):
> - lddqu -79(%rsi), %xmm0
> - movdqu %xmm0, -79(%rdi)
> -L(fwd_write_63bytes):
> - lddqu -63(%rsi), %xmm0
> - movdqu %xmm0, -63(%rdi)
> -L(fwd_write_47bytes):
> - lddqu -47(%rsi), %xmm0
> - movdqu %xmm0, -47(%rdi)
> -L(fwd_write_31bytes):
> - lddqu -31(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -31(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_15bytes):
> - mov -15(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -15(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_142bytes):
> - lddqu -142(%rsi), %xmm0
> - movdqu %xmm0, -142(%rdi)
> -L(fwd_write_126bytes):
> - lddqu -126(%rsi), %xmm0
> - movdqu %xmm0, -126(%rdi)
> -L(fwd_write_110bytes):
> - lddqu -110(%rsi), %xmm0
> - movdqu %xmm0, -110(%rdi)
> -L(fwd_write_94bytes):
> - lddqu -94(%rsi), %xmm0
> - movdqu %xmm0, -94(%rdi)
> -L(fwd_write_78bytes):
> - lddqu -78(%rsi), %xmm0
> - movdqu %xmm0, -78(%rdi)
> -L(fwd_write_62bytes):
> - lddqu -62(%rsi), %xmm0
> - movdqu %xmm0, -62(%rdi)
> -L(fwd_write_46bytes):
> - lddqu -46(%rsi), %xmm0
> - movdqu %xmm0, -46(%rdi)
> -L(fwd_write_30bytes):
> - lddqu -30(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -30(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_14bytes):
> - mov -14(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -14(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_141bytes):
> - lddqu -141(%rsi), %xmm0
> - movdqu %xmm0, -141(%rdi)
> -L(fwd_write_125bytes):
> - lddqu -125(%rsi), %xmm0
> - movdqu %xmm0, -125(%rdi)
> -L(fwd_write_109bytes):
> - lddqu -109(%rsi), %xmm0
> - movdqu %xmm0, -109(%rdi)
> -L(fwd_write_93bytes):
> - lddqu -93(%rsi), %xmm0
> - movdqu %xmm0, -93(%rdi)
> -L(fwd_write_77bytes):
> - lddqu -77(%rsi), %xmm0
> - movdqu %xmm0, -77(%rdi)
> -L(fwd_write_61bytes):
> - lddqu -61(%rsi), %xmm0
> - movdqu %xmm0, -61(%rdi)
> -L(fwd_write_45bytes):
> - lddqu -45(%rsi), %xmm0
> - movdqu %xmm0, -45(%rdi)
> -L(fwd_write_29bytes):
> - lddqu -29(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -29(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_13bytes):
> - mov -13(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -13(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_140bytes):
> - lddqu -140(%rsi), %xmm0
> - movdqu %xmm0, -140(%rdi)
> -L(fwd_write_124bytes):
> - lddqu -124(%rsi), %xmm0
> - movdqu %xmm0, -124(%rdi)
> -L(fwd_write_108bytes):
> - lddqu -108(%rsi), %xmm0
> - movdqu %xmm0, -108(%rdi)
> -L(fwd_write_92bytes):
> - lddqu -92(%rsi), %xmm0
> - movdqu %xmm0, -92(%rdi)
> -L(fwd_write_76bytes):
> - lddqu -76(%rsi), %xmm0
> - movdqu %xmm0, -76(%rdi)
> -L(fwd_write_60bytes):
> - lddqu -60(%rsi), %xmm0
> - movdqu %xmm0, -60(%rdi)
> -L(fwd_write_44bytes):
> - lddqu -44(%rsi), %xmm0
> - movdqu %xmm0, -44(%rdi)
> -L(fwd_write_28bytes):
> - lddqu -28(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -28(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_12bytes):
> - mov -12(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -12(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_139bytes):
> - lddqu -139(%rsi), %xmm0
> - movdqu %xmm0, -139(%rdi)
> -L(fwd_write_123bytes):
> - lddqu -123(%rsi), %xmm0
> - movdqu %xmm0, -123(%rdi)
> -L(fwd_write_107bytes):
> - lddqu -107(%rsi), %xmm0
> - movdqu %xmm0, -107(%rdi)
> -L(fwd_write_91bytes):
> - lddqu -91(%rsi), %xmm0
> - movdqu %xmm0, -91(%rdi)
> -L(fwd_write_75bytes):
> - lddqu -75(%rsi), %xmm0
> - movdqu %xmm0, -75(%rdi)
> -L(fwd_write_59bytes):
> - lddqu -59(%rsi), %xmm0
> - movdqu %xmm0, -59(%rdi)
> -L(fwd_write_43bytes):
> - lddqu -43(%rsi), %xmm0
> - movdqu %xmm0, -43(%rdi)
> -L(fwd_write_27bytes):
> - lddqu -27(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -27(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_11bytes):
> - mov -11(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -11(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_138bytes):
> - lddqu -138(%rsi), %xmm0
> - movdqu %xmm0, -138(%rdi)
> -L(fwd_write_122bytes):
> - lddqu -122(%rsi), %xmm0
> - movdqu %xmm0, -122(%rdi)
> -L(fwd_write_106bytes):
> - lddqu -106(%rsi), %xmm0
> - movdqu %xmm0, -106(%rdi)
> -L(fwd_write_90bytes):
> - lddqu -90(%rsi), %xmm0
> - movdqu %xmm0, -90(%rdi)
> -L(fwd_write_74bytes):
> - lddqu -74(%rsi), %xmm0
> - movdqu %xmm0, -74(%rdi)
> -L(fwd_write_58bytes):
> - lddqu -58(%rsi), %xmm0
> - movdqu %xmm0, -58(%rdi)
> -L(fwd_write_42bytes):
> - lddqu -42(%rsi), %xmm0
> - movdqu %xmm0, -42(%rdi)
> -L(fwd_write_26bytes):
> - lddqu -26(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -26(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_10bytes):
> - mov -10(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -10(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_137bytes):
> - lddqu -137(%rsi), %xmm0
> - movdqu %xmm0, -137(%rdi)
> -L(fwd_write_121bytes):
> - lddqu -121(%rsi), %xmm0
> - movdqu %xmm0, -121(%rdi)
> -L(fwd_write_105bytes):
> - lddqu -105(%rsi), %xmm0
> - movdqu %xmm0, -105(%rdi)
> -L(fwd_write_89bytes):
> - lddqu -89(%rsi), %xmm0
> - movdqu %xmm0, -89(%rdi)
> -L(fwd_write_73bytes):
> - lddqu -73(%rsi), %xmm0
> - movdqu %xmm0, -73(%rdi)
> -L(fwd_write_57bytes):
> - lddqu -57(%rsi), %xmm0
> - movdqu %xmm0, -57(%rdi)
> -L(fwd_write_41bytes):
> - lddqu -41(%rsi), %xmm0
> - movdqu %xmm0, -41(%rdi)
> -L(fwd_write_25bytes):
> - lddqu -25(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -25(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_9bytes):
> - mov -9(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -9(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_136bytes):
> - lddqu -136(%rsi), %xmm0
> - movdqu %xmm0, -136(%rdi)
> -L(fwd_write_120bytes):
> - lddqu -120(%rsi), %xmm0
> - movdqu %xmm0, -120(%rdi)
> -L(fwd_write_104bytes):
> - lddqu -104(%rsi), %xmm0
> - movdqu %xmm0, -104(%rdi)
> -L(fwd_write_88bytes):
> - lddqu -88(%rsi), %xmm0
> - movdqu %xmm0, -88(%rdi)
> -L(fwd_write_72bytes):
> - lddqu -72(%rsi), %xmm0
> - movdqu %xmm0, -72(%rdi)
> -L(fwd_write_56bytes):
> - lddqu -56(%rsi), %xmm0
> - movdqu %xmm0, -56(%rdi)
> -L(fwd_write_40bytes):
> - lddqu -40(%rsi), %xmm0
> - movdqu %xmm0, -40(%rdi)
> -L(fwd_write_24bytes):
> - lddqu -24(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -24(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_8bytes):
> - mov -8(%rsi), %rdx
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_135bytes):
> - lddqu -135(%rsi), %xmm0
> - movdqu %xmm0, -135(%rdi)
> -L(fwd_write_119bytes):
> - lddqu -119(%rsi), %xmm0
> - movdqu %xmm0, -119(%rdi)
> -L(fwd_write_103bytes):
> - lddqu -103(%rsi), %xmm0
> - movdqu %xmm0, -103(%rdi)
> -L(fwd_write_87bytes):
> - lddqu -87(%rsi), %xmm0
> - movdqu %xmm0, -87(%rdi)
> -L(fwd_write_71bytes):
> - lddqu -71(%rsi), %xmm0
> - movdqu %xmm0, -71(%rdi)
> -L(fwd_write_55bytes):
> - lddqu -55(%rsi), %xmm0
> - movdqu %xmm0, -55(%rdi)
> -L(fwd_write_39bytes):
> - lddqu -39(%rsi), %xmm0
> - movdqu %xmm0, -39(%rdi)
> -L(fwd_write_23bytes):
> - lddqu -23(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -23(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_7bytes):
> - mov -7(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -7(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_134bytes):
> - lddqu -134(%rsi), %xmm0
> - movdqu %xmm0, -134(%rdi)
> -L(fwd_write_118bytes):
> - lddqu -118(%rsi), %xmm0
> - movdqu %xmm0, -118(%rdi)
> -L(fwd_write_102bytes):
> - lddqu -102(%rsi), %xmm0
> - movdqu %xmm0, -102(%rdi)
> -L(fwd_write_86bytes):
> - lddqu -86(%rsi), %xmm0
> - movdqu %xmm0, -86(%rdi)
> -L(fwd_write_70bytes):
> - lddqu -70(%rsi), %xmm0
> - movdqu %xmm0, -70(%rdi)
> -L(fwd_write_54bytes):
> - lddqu -54(%rsi), %xmm0
> - movdqu %xmm0, -54(%rdi)
> -L(fwd_write_38bytes):
> - lddqu -38(%rsi), %xmm0
> - movdqu %xmm0, -38(%rdi)
> -L(fwd_write_22bytes):
> - lddqu -22(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -22(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_6bytes):
> - mov -6(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -6(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_133bytes):
> - lddqu -133(%rsi), %xmm0
> - movdqu %xmm0, -133(%rdi)
> -L(fwd_write_117bytes):
> - lddqu -117(%rsi), %xmm0
> - movdqu %xmm0, -117(%rdi)
> -L(fwd_write_101bytes):
> - lddqu -101(%rsi), %xmm0
> - movdqu %xmm0, -101(%rdi)
> -L(fwd_write_85bytes):
> - lddqu -85(%rsi), %xmm0
> - movdqu %xmm0, -85(%rdi)
> -L(fwd_write_69bytes):
> - lddqu -69(%rsi), %xmm0
> - movdqu %xmm0, -69(%rdi)
> -L(fwd_write_53bytes):
> - lddqu -53(%rsi), %xmm0
> - movdqu %xmm0, -53(%rdi)
> -L(fwd_write_37bytes):
> - lddqu -37(%rsi), %xmm0
> - movdqu %xmm0, -37(%rdi)
> -L(fwd_write_21bytes):
> - lddqu -21(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -21(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_5bytes):
> - mov -5(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -5(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_132bytes):
> - lddqu -132(%rsi), %xmm0
> - movdqu %xmm0, -132(%rdi)
> -L(fwd_write_116bytes):
> - lddqu -116(%rsi), %xmm0
> - movdqu %xmm0, -116(%rdi)
> -L(fwd_write_100bytes):
> - lddqu -100(%rsi), %xmm0
> - movdqu %xmm0, -100(%rdi)
> -L(fwd_write_84bytes):
> - lddqu -84(%rsi), %xmm0
> - movdqu %xmm0, -84(%rdi)
> -L(fwd_write_68bytes):
> - lddqu -68(%rsi), %xmm0
> - movdqu %xmm0, -68(%rdi)
> -L(fwd_write_52bytes):
> - lddqu -52(%rsi), %xmm0
> - movdqu %xmm0, -52(%rdi)
> -L(fwd_write_36bytes):
> - lddqu -36(%rsi), %xmm0
> - movdqu %xmm0, -36(%rdi)
> -L(fwd_write_20bytes):
> - lddqu -20(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -20(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_4bytes):
> - mov -4(%rsi), %edx
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_131bytes):
> - lddqu -131(%rsi), %xmm0
> - movdqu %xmm0, -131(%rdi)
> -L(fwd_write_115bytes):
> - lddqu -115(%rsi), %xmm0
> - movdqu %xmm0, -115(%rdi)
> -L(fwd_write_99bytes):
> - lddqu -99(%rsi), %xmm0
> - movdqu %xmm0, -99(%rdi)
> -L(fwd_write_83bytes):
> - lddqu -83(%rsi), %xmm0
> - movdqu %xmm0, -83(%rdi)
> -L(fwd_write_67bytes):
> - lddqu -67(%rsi), %xmm0
> - movdqu %xmm0, -67(%rdi)
> -L(fwd_write_51bytes):
> - lddqu -51(%rsi), %xmm0
> - movdqu %xmm0, -51(%rdi)
> -L(fwd_write_35bytes):
> - lddqu -35(%rsi), %xmm0
> - movdqu %xmm0, -35(%rdi)
> -L(fwd_write_19bytes):
> - lddqu -19(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -19(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_3bytes):
> - mov -3(%rsi), %dx
> - mov -2(%rsi), %cx
> - mov %dx, -3(%rdi)
> - mov %cx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_130bytes):
> - lddqu -130(%rsi), %xmm0
> - movdqu %xmm0, -130(%rdi)
> -L(fwd_write_114bytes):
> - lddqu -114(%rsi), %xmm0
> - movdqu %xmm0, -114(%rdi)
> -L(fwd_write_98bytes):
> - lddqu -98(%rsi), %xmm0
> - movdqu %xmm0, -98(%rdi)
> -L(fwd_write_82bytes):
> - lddqu -82(%rsi), %xmm0
> - movdqu %xmm0, -82(%rdi)
> -L(fwd_write_66bytes):
> - lddqu -66(%rsi), %xmm0
> - movdqu %xmm0, -66(%rdi)
> -L(fwd_write_50bytes):
> - lddqu -50(%rsi), %xmm0
> - movdqu %xmm0, -50(%rdi)
> -L(fwd_write_34bytes):
> - lddqu -34(%rsi), %xmm0
> - movdqu %xmm0, -34(%rdi)
> -L(fwd_write_18bytes):
> - lddqu -18(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -18(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_2bytes):
> - movzwl -2(%rsi), %edx
> - mov %dx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_129bytes):
> - lddqu -129(%rsi), %xmm0
> - movdqu %xmm0, -129(%rdi)
> -L(fwd_write_113bytes):
> - lddqu -113(%rsi), %xmm0
> - movdqu %xmm0, -113(%rdi)
> -L(fwd_write_97bytes):
> - lddqu -97(%rsi), %xmm0
> - movdqu %xmm0, -97(%rdi)
> -L(fwd_write_81bytes):
> - lddqu -81(%rsi), %xmm0
> - movdqu %xmm0, -81(%rdi)
> -L(fwd_write_65bytes):
> - lddqu -65(%rsi), %xmm0
> - movdqu %xmm0, -65(%rdi)
> -L(fwd_write_49bytes):
> - lddqu -49(%rsi), %xmm0
> - movdqu %xmm0, -49(%rdi)
> -L(fwd_write_33bytes):
> - lddqu -33(%rsi), %xmm0
> - movdqu %xmm0, -33(%rdi)
> -L(fwd_write_17bytes):
> - lddqu -17(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -17(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_1bytes):
> - movzbl -1(%rsi), %edx
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_128bytes):
> - lddqu 112(%rsi), %xmm0
> - movdqu %xmm0, 112(%rdi)
> -L(bwd_write_112bytes):
> - lddqu 96(%rsi), %xmm0
> - movdqu %xmm0, 96(%rdi)
> -L(bwd_write_96bytes):
> - lddqu 80(%rsi), %xmm0
> - movdqu %xmm0, 80(%rdi)
> -L(bwd_write_80bytes):
> - lddqu 64(%rsi), %xmm0
> - movdqu %xmm0, 64(%rdi)
> -L(bwd_write_64bytes):
> - lddqu 48(%rsi), %xmm0
> - movdqu %xmm0, 48(%rdi)
> -L(bwd_write_48bytes):
> - lddqu 32(%rsi), %xmm0
> - movdqu %xmm0, 32(%rdi)
> -L(bwd_write_32bytes):
> - lddqu 16(%rsi), %xmm0
> - movdqu %xmm0, 16(%rdi)
> -L(bwd_write_16bytes):
> - lddqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> -L(bwd_write_0bytes):
> - ret
> -
> - .p2align 4
> -L(bwd_write_143bytes):
> - lddqu 127(%rsi), %xmm0
> - movdqu %xmm0, 127(%rdi)
> -L(bwd_write_127bytes):
> - lddqu 111(%rsi), %xmm0
> - movdqu %xmm0, 111(%rdi)
> -L(bwd_write_111bytes):
> - lddqu 95(%rsi), %xmm0
> - movdqu %xmm0, 95(%rdi)
> -L(bwd_write_95bytes):
> - lddqu 79(%rsi), %xmm0
> - movdqu %xmm0, 79(%rdi)
> -L(bwd_write_79bytes):
> - lddqu 63(%rsi), %xmm0
> - movdqu %xmm0, 63(%rdi)
> -L(bwd_write_63bytes):
> - lddqu 47(%rsi), %xmm0
> - movdqu %xmm0, 47(%rdi)
> -L(bwd_write_47bytes):
> - lddqu 31(%rsi), %xmm0
> - movdqu %xmm0, 31(%rdi)
> -L(bwd_write_31bytes):
> - lddqu 15(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 15(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> -
> - .p2align 4
> -L(bwd_write_15bytes):
> - mov 7(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 7(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_142bytes):
> - lddqu 126(%rsi), %xmm0
> - movdqu %xmm0, 126(%rdi)
> -L(bwd_write_126bytes):
> - lddqu 110(%rsi), %xmm0
> - movdqu %xmm0, 110(%rdi)
> -L(bwd_write_110bytes):
> - lddqu 94(%rsi), %xmm0
> - movdqu %xmm0, 94(%rdi)
> -L(bwd_write_94bytes):
> - lddqu 78(%rsi), %xmm0
> - movdqu %xmm0, 78(%rdi)
> -L(bwd_write_78bytes):
> - lddqu 62(%rsi), %xmm0
> - movdqu %xmm0, 62(%rdi)
> -L(bwd_write_62bytes):
> - lddqu 46(%rsi), %xmm0
> - movdqu %xmm0, 46(%rdi)
> -L(bwd_write_46bytes):
> - lddqu 30(%rsi), %xmm0
> - movdqu %xmm0, 30(%rdi)
> -L(bwd_write_30bytes):
> - lddqu 14(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 14(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_14bytes):
> - mov 6(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 6(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_141bytes):
> - lddqu 125(%rsi), %xmm0
> - movdqu %xmm0, 125(%rdi)
> -L(bwd_write_125bytes):
> - lddqu 109(%rsi), %xmm0
> - movdqu %xmm0, 109(%rdi)
> -L(bwd_write_109bytes):
> - lddqu 93(%rsi), %xmm0
> - movdqu %xmm0, 93(%rdi)
> -L(bwd_write_93bytes):
> - lddqu 77(%rsi), %xmm0
> - movdqu %xmm0, 77(%rdi)
> -L(bwd_write_77bytes):
> - lddqu 61(%rsi), %xmm0
> - movdqu %xmm0, 61(%rdi)
> -L(bwd_write_61bytes):
> - lddqu 45(%rsi), %xmm0
> - movdqu %xmm0, 45(%rdi)
> -L(bwd_write_45bytes):
> - lddqu 29(%rsi), %xmm0
> - movdqu %xmm0, 29(%rdi)
> -L(bwd_write_29bytes):
> - lddqu 13(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 13(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_13bytes):
> - mov 5(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 5(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_140bytes):
> - lddqu 124(%rsi), %xmm0
> - movdqu %xmm0, 124(%rdi)
> -L(bwd_write_124bytes):
> - lddqu 108(%rsi), %xmm0
> - movdqu %xmm0, 108(%rdi)
> -L(bwd_write_108bytes):
> - lddqu 92(%rsi), %xmm0
> - movdqu %xmm0, 92(%rdi)
> -L(bwd_write_92bytes):
> - lddqu 76(%rsi), %xmm0
> - movdqu %xmm0, 76(%rdi)
> -L(bwd_write_76bytes):
> - lddqu 60(%rsi), %xmm0
> - movdqu %xmm0, 60(%rdi)
> -L(bwd_write_60bytes):
> - lddqu 44(%rsi), %xmm0
> - movdqu %xmm0, 44(%rdi)
> -L(bwd_write_44bytes):
> - lddqu 28(%rsi), %xmm0
> - movdqu %xmm0, 28(%rdi)
> -L(bwd_write_28bytes):
> - lddqu 12(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 12(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_12bytes):
> - mov 4(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 4(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_139bytes):
> - lddqu 123(%rsi), %xmm0
> - movdqu %xmm0, 123(%rdi)
> -L(bwd_write_123bytes):
> - lddqu 107(%rsi), %xmm0
> - movdqu %xmm0, 107(%rdi)
> -L(bwd_write_107bytes):
> - lddqu 91(%rsi), %xmm0
> - movdqu %xmm0, 91(%rdi)
> -L(bwd_write_91bytes):
> - lddqu 75(%rsi), %xmm0
> - movdqu %xmm0, 75(%rdi)
> -L(bwd_write_75bytes):
> - lddqu 59(%rsi), %xmm0
> - movdqu %xmm0, 59(%rdi)
> -L(bwd_write_59bytes):
> - lddqu 43(%rsi), %xmm0
> - movdqu %xmm0, 43(%rdi)
> -L(bwd_write_43bytes):
> - lddqu 27(%rsi), %xmm0
> - movdqu %xmm0, 27(%rdi)
> -L(bwd_write_27bytes):
> - lddqu 11(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 11(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_11bytes):
> - mov 3(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 3(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_138bytes):
> - lddqu 122(%rsi), %xmm0
> - movdqu %xmm0, 122(%rdi)
> -L(bwd_write_122bytes):
> - lddqu 106(%rsi), %xmm0
> - movdqu %xmm0, 106(%rdi)
> -L(bwd_write_106bytes):
> - lddqu 90(%rsi), %xmm0
> - movdqu %xmm0, 90(%rdi)
> -L(bwd_write_90bytes):
> - lddqu 74(%rsi), %xmm0
> - movdqu %xmm0, 74(%rdi)
> -L(bwd_write_74bytes):
> - lddqu 58(%rsi), %xmm0
> - movdqu %xmm0, 58(%rdi)
> -L(bwd_write_58bytes):
> - lddqu 42(%rsi), %xmm0
> - movdqu %xmm0, 42(%rdi)
> -L(bwd_write_42bytes):
> - lddqu 26(%rsi), %xmm0
> - movdqu %xmm0, 26(%rdi)
> -L(bwd_write_26bytes):
> - lddqu 10(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 10(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_10bytes):
> - mov 2(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 2(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_137bytes):
> - lddqu 121(%rsi), %xmm0
> - movdqu %xmm0, 121(%rdi)
> -L(bwd_write_121bytes):
> - lddqu 105(%rsi), %xmm0
> - movdqu %xmm0, 105(%rdi)
> -L(bwd_write_105bytes):
> - lddqu 89(%rsi), %xmm0
> - movdqu %xmm0, 89(%rdi)
> -L(bwd_write_89bytes):
> - lddqu 73(%rsi), %xmm0
> - movdqu %xmm0, 73(%rdi)
> -L(bwd_write_73bytes):
> - lddqu 57(%rsi), %xmm0
> - movdqu %xmm0, 57(%rdi)
> -L(bwd_write_57bytes):
> - lddqu 41(%rsi), %xmm0
> - movdqu %xmm0, 41(%rdi)
> -L(bwd_write_41bytes):
> - lddqu 25(%rsi), %xmm0
> - movdqu %xmm0, 25(%rdi)
> -L(bwd_write_25bytes):
> - lddqu 9(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 9(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_9bytes):
> - mov 1(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 1(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_136bytes):
> - lddqu 120(%rsi), %xmm0
> - movdqu %xmm0, 120(%rdi)
> -L(bwd_write_120bytes):
> - lddqu 104(%rsi), %xmm0
> - movdqu %xmm0, 104(%rdi)
> -L(bwd_write_104bytes):
> - lddqu 88(%rsi), %xmm0
> - movdqu %xmm0, 88(%rdi)
> -L(bwd_write_88bytes):
> - lddqu 72(%rsi), %xmm0
> - movdqu %xmm0, 72(%rdi)
> -L(bwd_write_72bytes):
> - lddqu 56(%rsi), %xmm0
> - movdqu %xmm0, 56(%rdi)
> -L(bwd_write_56bytes):
> - lddqu 40(%rsi), %xmm0
> - movdqu %xmm0, 40(%rdi)
> -L(bwd_write_40bytes):
> - lddqu 24(%rsi), %xmm0
> - movdqu %xmm0, 24(%rdi)
> -L(bwd_write_24bytes):
> - lddqu 8(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 8(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_8bytes):
> - mov (%rsi), %rdx
> - mov %rdx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_135bytes):
> - lddqu 119(%rsi), %xmm0
> - movdqu %xmm0, 119(%rdi)
> -L(bwd_write_119bytes):
> - lddqu 103(%rsi), %xmm0
> - movdqu %xmm0, 103(%rdi)
> -L(bwd_write_103bytes):
> - lddqu 87(%rsi), %xmm0
> - movdqu %xmm0, 87(%rdi)
> -L(bwd_write_87bytes):
> - lddqu 71(%rsi), %xmm0
> - movdqu %xmm0, 71(%rdi)
> -L(bwd_write_71bytes):
> - lddqu 55(%rsi), %xmm0
> - movdqu %xmm0, 55(%rdi)
> -L(bwd_write_55bytes):
> - lddqu 39(%rsi), %xmm0
> - movdqu %xmm0, 39(%rdi)
> -L(bwd_write_39bytes):
> - lddqu 23(%rsi), %xmm0
> - movdqu %xmm0, 23(%rdi)
> -L(bwd_write_23bytes):
> - lddqu 7(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 7(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_7bytes):
> - mov 3(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 3(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_134bytes):
> - lddqu 118(%rsi), %xmm0
> - movdqu %xmm0, 118(%rdi)
> -L(bwd_write_118bytes):
> - lddqu 102(%rsi), %xmm0
> - movdqu %xmm0, 102(%rdi)
> -L(bwd_write_102bytes):
> - lddqu 86(%rsi), %xmm0
> - movdqu %xmm0, 86(%rdi)
> -L(bwd_write_86bytes):
> - lddqu 70(%rsi), %xmm0
> - movdqu %xmm0, 70(%rdi)
> -L(bwd_write_70bytes):
> - lddqu 54(%rsi), %xmm0
> - movdqu %xmm0, 54(%rdi)
> -L(bwd_write_54bytes):
> - lddqu 38(%rsi), %xmm0
> - movdqu %xmm0, 38(%rdi)
> -L(bwd_write_38bytes):
> - lddqu 22(%rsi), %xmm0
> - movdqu %xmm0, 22(%rdi)
> -L(bwd_write_22bytes):
> - lddqu 6(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 6(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_6bytes):
> - mov 2(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 2(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_133bytes):
> - lddqu 117(%rsi), %xmm0
> - movdqu %xmm0, 117(%rdi)
> -L(bwd_write_117bytes):
> - lddqu 101(%rsi), %xmm0
> - movdqu %xmm0, 101(%rdi)
> -L(bwd_write_101bytes):
> - lddqu 85(%rsi), %xmm0
> - movdqu %xmm0, 85(%rdi)
> -L(bwd_write_85bytes):
> - lddqu 69(%rsi), %xmm0
> - movdqu %xmm0, 69(%rdi)
> -L(bwd_write_69bytes):
> - lddqu 53(%rsi), %xmm0
> - movdqu %xmm0, 53(%rdi)
> -L(bwd_write_53bytes):
> - lddqu 37(%rsi), %xmm0
> - movdqu %xmm0, 37(%rdi)
> -L(bwd_write_37bytes):
> - lddqu 21(%rsi), %xmm0
> - movdqu %xmm0, 21(%rdi)
> -L(bwd_write_21bytes):
> - lddqu 5(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 5(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_5bytes):
> - mov 1(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 1(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_132bytes):
> - lddqu 116(%rsi), %xmm0
> - movdqu %xmm0, 116(%rdi)
> -L(bwd_write_116bytes):
> - lddqu 100(%rsi), %xmm0
> - movdqu %xmm0, 100(%rdi)
> -L(bwd_write_100bytes):
> - lddqu 84(%rsi), %xmm0
> - movdqu %xmm0, 84(%rdi)
> -L(bwd_write_84bytes):
> - lddqu 68(%rsi), %xmm0
> - movdqu %xmm0, 68(%rdi)
> -L(bwd_write_68bytes):
> - lddqu 52(%rsi), %xmm0
> - movdqu %xmm0, 52(%rdi)
> -L(bwd_write_52bytes):
> - lddqu 36(%rsi), %xmm0
> - movdqu %xmm0, 36(%rdi)
> -L(bwd_write_36bytes):
> - lddqu 20(%rsi), %xmm0
> - movdqu %xmm0, 20(%rdi)
> -L(bwd_write_20bytes):
> - lddqu 4(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 4(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_4bytes):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_131bytes):
> - lddqu 115(%rsi), %xmm0
> - movdqu %xmm0, 115(%rdi)
> -L(bwd_write_115bytes):
> - lddqu 99(%rsi), %xmm0
> - movdqu %xmm0, 99(%rdi)
> -L(bwd_write_99bytes):
> - lddqu 83(%rsi), %xmm0
> - movdqu %xmm0, 83(%rdi)
> -L(bwd_write_83bytes):
> - lddqu 67(%rsi), %xmm0
> - movdqu %xmm0, 67(%rdi)
> -L(bwd_write_67bytes):
> - lddqu 51(%rsi), %xmm0
> - movdqu %xmm0, 51(%rdi)
> -L(bwd_write_51bytes):
> - lddqu 35(%rsi), %xmm0
> - movdqu %xmm0, 35(%rdi)
> -L(bwd_write_35bytes):
> - lddqu 19(%rsi), %xmm0
> - movdqu %xmm0, 19(%rdi)
> -L(bwd_write_19bytes):
> - lddqu 3(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 3(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_3bytes):
> - mov 1(%rsi), %dx
> - mov (%rsi), %cx
> - mov %dx, 1(%rdi)
> - mov %cx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_130bytes):
> - lddqu 114(%rsi), %xmm0
> - movdqu %xmm0, 114(%rdi)
> -L(bwd_write_114bytes):
> - lddqu 98(%rsi), %xmm0
> - movdqu %xmm0, 98(%rdi)
> -L(bwd_write_98bytes):
> - lddqu 82(%rsi), %xmm0
> - movdqu %xmm0, 82(%rdi)
> -L(bwd_write_82bytes):
> - lddqu 66(%rsi), %xmm0
> - movdqu %xmm0, 66(%rdi)
> -L(bwd_write_66bytes):
> - lddqu 50(%rsi), %xmm0
> - movdqu %xmm0, 50(%rdi)
> -L(bwd_write_50bytes):
> - lddqu 34(%rsi), %xmm0
> - movdqu %xmm0, 34(%rdi)
> -L(bwd_write_34bytes):
> - lddqu 18(%rsi), %xmm0
> - movdqu %xmm0, 18(%rdi)
> -L(bwd_write_18bytes):
> - lddqu 2(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 2(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_2bytes):
> - movzwl (%rsi), %edx
> - mov %dx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_129bytes):
> - lddqu 113(%rsi), %xmm0
> - movdqu %xmm0, 113(%rdi)
> -L(bwd_write_113bytes):
> - lddqu 97(%rsi), %xmm0
> - movdqu %xmm0, 97(%rdi)
> -L(bwd_write_97bytes):
> - lddqu 81(%rsi), %xmm0
> - movdqu %xmm0, 81(%rdi)
> -L(bwd_write_81bytes):
> - lddqu 65(%rsi), %xmm0
> - movdqu %xmm0, 65(%rdi)
> -L(bwd_write_65bytes):
> - lddqu 49(%rsi), %xmm0
> - movdqu %xmm0, 49(%rdi)
> -L(bwd_write_49bytes):
> - lddqu 33(%rsi), %xmm0
> - movdqu %xmm0, 33(%rdi)
> -L(bwd_write_33bytes):
> - lddqu 17(%rsi), %xmm0
> - movdqu %xmm0, 17(%rdi)
> -L(bwd_write_17bytes):
> - lddqu 1(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 1(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_1bytes):
> - movzbl (%rsi), %edx
> - mov %dl, (%rdi)
> - ret
> -
> -END (MEMCPY)
> -
> - .section .rodata.ssse3,"a",@progbits
> - .p2align 3
> -L(table_144_bytes_bwd):
> - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
> -
> - .p2align 3
> -L(table_144_bytes_fwd):
> - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
> -
> - .p2align 3
> -L(shl_table_fwd):
> - .int JMPTBL (L(shl_0), L(shl_table_fwd))
> - .int JMPTBL (L(shl_1), L(shl_table_fwd))
> - .int JMPTBL (L(shl_2), L(shl_table_fwd))
> - .int JMPTBL (L(shl_3), L(shl_table_fwd))
> - .int JMPTBL (L(shl_4), L(shl_table_fwd))
> - .int JMPTBL (L(shl_5), L(shl_table_fwd))
> - .int JMPTBL (L(shl_6), L(shl_table_fwd))
> - .int JMPTBL (L(shl_7), L(shl_table_fwd))
> - .int JMPTBL (L(shl_8), L(shl_table_fwd))
> - .int JMPTBL (L(shl_9), L(shl_table_fwd))
> - .int JMPTBL (L(shl_10), L(shl_table_fwd))
> - .int JMPTBL (L(shl_11), L(shl_table_fwd))
> - .int JMPTBL (L(shl_12), L(shl_table_fwd))
> - .int JMPTBL (L(shl_13), L(shl_table_fwd))
> - .int JMPTBL (L(shl_14), L(shl_table_fwd))
> - .int JMPTBL (L(shl_15), L(shl_table_fwd))
> -
> - .p2align 3
> -L(shl_table_bwd):
> - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> deleted file mode 100644
> index f9a4e9aff9..0000000000
> --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_MEMMOVE
> -#define MEMCPY __memmove_ssse3_back
> -#define MEMCPY_CHK __memmove_chk_ssse3_back
> -#include "memcpy-ssse3-back.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 5/6] x86: Remove str{n}cat-ssse3
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-03-25 19:57 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
> prefer SSSE3.  As a result it's no longer worth the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
> sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
> sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
> sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
> 5 files changed, 879 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 323be3b969..a2ebc06c5f 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -59,7 +59,6 @@ sysdep_routines += \
> strcat-evex \
> strcat-sse2 \
> strcat-sse2-unaligned \
> - strcat-ssse3 \
> strchr-avx2 \
> strchr-avx2-rtm \
> strchr-evex \
> @@ -97,7 +96,6 @@ sysdep_routines += \
> strncat-c \
> strncat-evex \
> strncat-sse2-unaligned \
> - strncat-ssse3 \
> strncmp-avx2 \
> strncmp-avx2-rtm \
> strncmp-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index d6852ab365..4133ed7e43 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcat_evex)
> - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
> - __strcat_ssse3)
> IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
>
> @@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncat_evex)
> - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
> - __strncat_ssse3)
> IFUNC_IMPL_ADD (array, i, strncat, 1,
> __strncat_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> index 5bece38f78..a15afa44e9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> @@ -23,7 +23,6 @@
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> return OPTIMIZE (sse2_unaligned);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
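
The hunk above drops the SSSE3 tier from the strcpy/strcat selector, so
dispatch now falls straight from the AVX2/EVEX checks to the SSE2
baseline.  For illustration only, here is a minimal self-contained
sketch of that kind of CPU-feature dispatch, using GCC's
__builtin_cpu_supports.  The cat_* functions are hypothetical stand-ins
for glibc's internal __strcat_* variants, not the real implementations,
and the feature tests are a simplified version of the ordering left in
ifunc-strcpy.h after this patch:

/* Sketch: pick a strcat variant by CPU feature, loosely mirroring the
   selection order that remains once the SSSE3 branch is removed.  */
#include <stdio.h>
#include <string.h>

/* Hypothetical stand-ins; in glibc each would be a tuned
   implementation in its own .S file.  */
static char *cat_evex (char *d, const char *s) { return strcat (d, s); }
static char *cat_avx2 (char *d, const char *s) { return strcat (d, s); }
static char *cat_sse2 (char *d, const char *s) { return strcat (d, s); }

static char *(*pick_strcat (void)) (char *, const char *)
{
  __builtin_cpu_init ();
  if (__builtin_cpu_supports ("avx512vl")
      && __builtin_cpu_supports ("avx512bw"))
    return cat_evex;
  if (__builtin_cpu_supports ("avx2"))
    return cat_avx2;
  /* No SSSE3 check any more: everything below AVX2 takes the SSE2
     baseline.  */
  return cat_sse2;
}

int main (void)
{
  char buf[16] = "foo";
  pick_strcat () (buf, "bar");
  puts (buf);	/* prints "foobar" */
  return 0;
}

(In glibc itself the selector runs as an IFUNC resolver bound once at
load time rather than through a function pointer on every call; the
sketch only shows the feature-test ordering.)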
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> deleted file mode 100644
> index 9f39e4fcd1..0000000000
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ /dev/null
> @@ -1,866 +0,0 @@
> -/* strcat with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -# define STRCAT __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> - mov %rdx, %r8
> -# endif
> -
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> - implementation gets merged. */
> -
> - xor %eax, %eax
> - cmpb $0, (%rdi)
> - jz L(exit_tail0)
> - cmpb $0, 1(%rdi)
> - jz L(exit_tail1)
> - cmpb $0, 2(%rdi)
> - jz L(exit_tail2)
> - cmpb $0, 3(%rdi)
> - jz L(exit_tail3)
> -
> - cmpb $0, 4(%rdi)
> - jz L(exit_tail4)
> - cmpb $0, 5(%rdi)
> - jz L(exit_tail5)
> - cmpb $0, 6(%rdi)
> - jz L(exit_tail6)
> - cmpb $0, 7(%rdi)
> - jz L(exit_tail7)
> -
> - cmpb $0, 8(%rdi)
> - jz L(exit_tail8)
> - cmpb $0, 9(%rdi)
> - jz L(exit_tail9)
> - cmpb $0, 10(%rdi)
> - jz L(exit_tail10)
> - cmpb $0, 11(%rdi)
> - jz L(exit_tail11)
> -
> - cmpb $0, 12(%rdi)
> - jz L(exit_tail12)
> - cmpb $0, 13(%rdi)
> - jz L(exit_tail13)
> - cmpb $0, 14(%rdi)
> - jz L(exit_tail14)
> - cmpb $0, 15(%rdi)
> - jz L(exit_tail15)
> - pxor %xmm0, %xmm0
> - lea 16(%rdi), %rcx
> - lea 16(%rdi), %rax
> - and $-16, %rax
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - pxor %xmm1, %xmm1
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - pxor %xmm2, %xmm2
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - pxor %xmm3, %xmm3
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - and $-0x40, %rax
> -
> - .p2align 4
> -L(aligned_64):
> - pcmpeqb (%rax), %xmm0
> - pcmpeqb 16(%rax), %xmm1
> - pcmpeqb 32(%rax), %xmm2
> - pcmpeqb 48(%rax), %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm1, %r11d
> - pmovmskb %xmm2, %r10d
> - pmovmskb %xmm3, %r9d
> - or %edx, %r9d
> - or %r11d, %r9d
> - or %r10d, %r9d
> - lea 64(%rax), %rax
> - jz L(aligned_64)
> -
> - test %edx, %edx
> - jnz L(aligned_64_exit_16)
> - test %r11d, %r11d
> - jnz L(aligned_64_exit_32)
> - test %r10d, %r10d
> - jnz L(aligned_64_exit_48)
> -
> -L(aligned_64_exit_64):
> - pmovmskb %xmm3, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_48):
> - lea -16(%rax), %rax
> - mov %r10d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_32):
> - lea -32(%rax), %rax
> - mov %r11d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_16):
> - lea -48(%rax), %rax
> -
> -L(exit):
> - sub %rcx, %rax
> - test %dl, %dl
> - jz L(exit_high)
> - test $0x01, %dl
> - jnz L(exit_tail0)
> -
> - test $0x02, %dl
> - jnz L(exit_tail1)
> -
> - test $0x04, %dl
> - jnz L(exit_tail2)
> -
> - test $0x08, %dl
> - jnz L(exit_tail3)
> -
> - test $0x10, %dl
> - jnz L(exit_tail4)
> -
> - test $0x20, %dl
> - jnz L(exit_tail5)
> -
> - test $0x40, %dl
> - jnz L(exit_tail6)
> - add $7, %eax
> -L(exit_tail0):
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_high):
> - add $8, %eax
> - test $0x01, %dh
> - jnz L(exit_tail0)
> -
> - test $0x02, %dh
> - jnz L(exit_tail1)
> -
> - test $0x04, %dh
> - jnz L(exit_tail2)
> -
> - test $0x08, %dh
> - jnz L(exit_tail3)
> -
> - test $0x10, %dh
> - jnz L(exit_tail4)
> -
> - test $0x20, %dh
> - jnz L(exit_tail5)
> -
> - test $0x40, %dh
> - jnz L(exit_tail6)
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail1):
> - add $1, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail2):
> - add $2, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail3):
> - add $3, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail4):
> - add $4, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail5):
> - add $5, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail6):
> - add $6, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail7):
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail8):
> - add $8, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail9):
> - add $9, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail10):
> - add $10, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail11):
> - add $11, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail12):
> - add $12, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail13):
> - add $13, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail14):
> - add $14, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail15):
> - add $15, %eax
> -
> - .p2align 4
> -L(StartStrcpyPart):
> - mov %rsi, %rcx
> - lea (%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> - test %r8, %r8
> - jz L(StrncatExit0)
> - cmp $8, %r8
> - jbe L(StrncatExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - jb L(StrncatExit15Bytes)
> -# endif
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - je L(StrncatExit16)
> -# define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit1):
> - xor %ah, %ah
> - movb %ah, 1(%rdx)
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit2):
> - xor %ah, %ah
> - movb %ah, 2(%rdx)
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit3):
> - xor %ah, %ah
> - movb %ah, 3(%rdx)
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit4):
> - xor %ah, %ah
> - movb %ah, 4(%rdx)
> -L(Exit4):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit5):
> - xor %ah, %ah
> - movb %ah, 5(%rdx)
> -L(Exit5):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit6):
> - xor %ah, %ah
> - movb %ah, 6(%rdx)
> -L(Exit6):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit7):
> - xor %ah, %ah
> - movb %ah, 7(%rdx)
> -L(Exit7):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov 3(%rcx), %eax
> - mov %eax, 3(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8):
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> -L(Exit8):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit9):
> - xor %ah, %ah
> - movb %ah, 9(%rdx)
> -L(Exit9):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movb 8(%rcx), %al
> - movb %al, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit10):
> - xor %ah, %ah
> - movb %ah, 10(%rdx)
> -L(Exit10):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movw 8(%rcx), %ax
> - movw %ax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit11):
> - xor %ah, %ah
> - movb %ah, 11(%rdx)
> -L(Exit11):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit12):
> - xor %ah, %ah
> - movb %ah, 12(%rdx)
> -L(Exit12):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit13):
> - xor %ah, %ah
> - movb %ah, 13(%rdx)
> -L(Exit13):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 5(%rcx), %xmm1
> - movlpd %xmm1, 5(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit14):
> - xor %ah, %ah
> - movb %ah, 14(%rdx)
> -L(Exit14):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 6(%rcx), %xmm1
> - movlpd %xmm1, 6(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15):
> - xor %ah, %ah
> - movb %ah, 15(%rdx)
> -L(Exit15):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit16):
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> -L(Exit16):
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
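The cmpb $1/sbb $-1 pair just above is a branchless "advance unless the
byte is already NUL": cmpb $1 sets the carry flag exactly when the byte
is zero, and sbb $-1 then adds 1 - CF. In portable C (a sketch of the
idiom, not the asm):

  /* Return p unchanged if *p is the terminator, else p + 1; the
     following byte store then NUL-terminates in either case.  */
  static inline char *
  bump_unless_nul (char *p)
  {
    return p + (*p != 0);
  }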
> - .p2align 4
> -L(ExitHighCase2):
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $9, %r8
> - je L(StrncatExit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
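The lea/and/or dance at the top of L(CopyFrom1To16BytesCase2) is a
branchless combination of two tests: bit 15 of r8 - 9 is set when at
most 8 bytes of the count remain, and or'ing in %al folds in whether a
NUL was seen in the low 8 bytes. A sketch of the condition it computes,
under that reading of the flags (hypothetical model, not the code):

  /* Take the high half only if no NUL sits in the low 8 bytes and
     more than 8 bytes of the count remain.  */
  static inline int
  take_high_half (unsigned char low_mask, unsigned long remaining)
  {
    return low_mask == 0 && remaining > 8;
  }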
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $8, %r8
> - ja L(ExitHighCase3)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHighCase3):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit0):
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15Bytes):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8Bytes):
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> deleted file mode 100644
> index 6c45ff3ec7..0000000000
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
@ 2022-03-25 19:57 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-03-25 19:57 UTC (permalink / raw)
To: Noah Goldstein; +Cc: libc-alpha, carlos
On Fri, Mar 25, 2022 at 11:36 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result it's no longer worth the code size cost.
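
A note for readers following along: after this series the strcpy-family
ifunc selectors fall straight from the EVEX/AVX2 implementations to the
SSE2 baseline, with no SSSE3 step in between. A minimal sketch of the
resulting selection order; the helper names follow the usual glibc
pattern, but this is not the exact contents of ifunc-strcpy.h:

  static inline void *
  IFUNC_SELECTOR (void)
  {
    const struct cpu_features *cpu_features = __get_cpu_features ();

    if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
      {
	if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
	    && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
	  return OPTIMIZE (evex);
	if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
	  return OPTIMIZE (avx2_rtm);
	return OPTIMIZE (avx2);
      }

    /* The SSSE3 check removed by this patch used to sit here.  */
    return OPTIMIZE (sse2_unaligned);
  }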
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
> sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
> sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
> sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
> sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
> 6 files changed, 3572 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index a2ebc06c5f..292353bad7 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -42,13 +42,11 @@ sysdep_routines += \
> stpcpy-evex \
> stpcpy-sse2 \
> stpcpy-sse2-unaligned \
> - stpcpy-ssse3 \
> stpncpy-avx2 \
> stpncpy-avx2-rtm \
> stpncpy-c \
> stpncpy-evex \
> stpncpy-sse2-unaligned \
> - stpncpy-ssse3 \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> strcasecmp_l-evex \
> @@ -79,7 +77,6 @@ sysdep_routines += \
> strcpy-evex \
> strcpy-sse2 \
> strcpy-sse2-unaligned \
> - strcpy-ssse3 \
> strcspn-c \
> strcspn-sse2 \
> strlen-avx2 \
> @@ -106,7 +103,6 @@ sysdep_routines += \
> strncpy-c \
> strncpy-evex \
> strncpy-sse2-unaligned \
> - strncpy-ssse3 \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 4133ed7e43..505b8002e0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
> __stpncpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpcpy.c. */
> IFUNC_IMPL (i, name, stpcpy,
> - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
> __stpcpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcpy_evex)
> - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> - __strcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncpy_evex)
> - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> - __strncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strncpy, 1,
> __strncpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_ssse3
> -# endif
> -
> - .section .text.ssse3,"ax",@progbits
> -ENTRY (STRCPY)
> -
> - mov %rsi, %rcx
> -# ifdef USE_AS_STRNCPY
> - mov %RDX_LP, %R8_LP
> -# endif
> - mov %rdi, %rdx
> -# ifdef USE_AS_STRNCPY
> - test %R8_LP, %R8_LP
> - jz L(Exit0)
> - cmp $8, %R8_LP
> - jbe L(StrncpyExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - jb L(StrncpyExit15Bytes)
> -# endif
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - je L(Exit16)
> -# endif
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - mov %rcx, %rsi
> - sub $16, %r8
> - and $0xf, %rsi
> -
> -/* add 16 bytes rcx_offset to r8 */
> -
> - add %rsi, %r8
> -# endif
> - lea 16(%rcx), %rsi
> - and $-16, %rsi
> - pxor %xmm0, %xmm0
> - mov (%rcx), %r9
> - mov %r9, (%rdx)
> - pcmpeqb (%rsi), %xmm0
> - mov 8(%rcx), %r9
> - mov %r9, 8(%rdx)
> -
> -/* convert byte mask in xmm0 to bit mask */
> -
> - pmovmskb %xmm0, %rax
> - sub %rcx, %rsi
> -
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
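The pxor/pcmpeqb/pmovmskb sequence above is the standard SSE2 NUL scan:
compare 16 bytes against zero, then compress the byte-wise result into
a 16-bit mask. The same idiom with intrinsics (illustrative only):

  #include <emmintrin.h>

  /* Bit i of the returned mask is set iff p[i] == 0; p must be
     16-byte aligned for the aligned load.  */
  static inline unsigned
  nul_mask_16 (const char *p)
  {
    __m128i chunk = _mm_load_si128 ((const __m128i *) p);
    __m128i zero = _mm_setzero_si128 ();
    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, zero));
  }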
> - mov %rdx, %rax
> - lea 16(%rdx), %rdx
> - and $-16, %rdx
> - sub %rdx, %rax
> -
> -# ifdef USE_AS_STRNCPY
> - add %rax, %rsi
> - lea -1(%rsi), %rsi
> - and $1<<31, %esi
> - test %rsi, %rsi
> - jnz L(ContinueCopy)
> - lea 16(%r8), %r8
> -
> -L(ContinueCopy):
> -# endif
> - sub %rax, %rcx
> - mov %rcx, %rax
> - and $0xf, %rax
> - mov $0, %rsi
> -
> -/* case: rcx_offset == rdx_offset */
> -
> - jz L(Align16Both)
> -
> - cmp $8, %rax
> - jae L(ShlHigh8)
> - cmp $1, %rax
> - je L(Shl1)
> - cmp $2, %rax
> - je L(Shl2)
> - cmp $3, %rax
> - je L(Shl3)
> - cmp $4, %rax
> - je L(Shl4)
> - cmp $5, %rax
> - je L(Shl5)
> - cmp $6, %rax
> - je L(Shl6)
> - jmp L(Shl7)
> -
> -L(ShlHigh8):
> - je L(Shl8)
> - cmp $9, %rax
> - je L(Shl9)
> - cmp $10, %rax
> - je L(Shl10)
> - cmp $11, %rax
> - je L(Shl11)
> - cmp $12, %rax
> - je L(Shl12)
> - cmp $13, %rax
> - je L(Shl13)
> - cmp $14, %rax
> - je L(Shl14)
> - jmp L(Shl15)
> -
> -L(Align16Both):
> - movaps (%rcx), %xmm1
> - movaps 16(%rcx), %xmm2
> - movaps %xmm1, (%rdx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm4
> - movaps %xmm3, (%rdx, %rsi)
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm1
> - movaps %xmm4, (%rdx, %rsi)
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm2
> - movaps %xmm1, (%rdx, %rsi)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm3, (%rdx, %rsi)
> - mov %rcx, %rax
> - lea 16(%rcx, %rsi), %rcx
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - lea 112(%r8, %rax), %r8
> -# endif
> - mov $-0x40, %rsi
> -
> - .p2align 4
> -L(Aligned64Loop):
> - movaps (%rcx), %xmm2
> - movaps %xmm2, %xmm4
> - movaps 16(%rcx), %xmm5
> - movaps 32(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 48(%rcx), %xmm7
> - pminub %xmm5, %xmm2
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rax
> - lea 64(%rdx), %rdx
> - lea 64(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeaveCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Aligned64Leave)
> - movaps %xmm4, -64(%rdx)
> - movaps %xmm5, -48(%rdx)
> - movaps %xmm6, -32(%rdx)
> - movaps %xmm7, -16(%rdx)
> - jmp L(Aligned64Loop)
> -
> -L(Aligned64Leave):
> -# ifdef USE_AS_STRNCPY
> - lea 48(%r8), %r8
> -# endif
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm6, -32(%rdx)
> - pcmpeqb %xmm7, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
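L(Aligned64Loop) checks four 16-byte vectors per iteration with a
single pcmpeqb by folding them together first: an unsigned byte minimum
is zero exactly where one of its inputs is zero. A sketch of the
reduction with intrinsics (illustrative only):

  #include <emmintrin.h>

  /* Nonzero iff any of the 64 bytes at 16-byte-aligned p is NUL.  */
  static inline int
  has_nul_in_64 (const char *p)
  {
    __m128i a = _mm_load_si128 ((const __m128i *) p);
    __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
    __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
    __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
    __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b),
			      _mm_min_epu8 (c, d));
    return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
  }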
> - .p2align 4
> -L(Shl1):
> - movaps -1(%rcx), %xmm1
> - movaps 15(%rcx), %xmm2
> -L(Shl1Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 31(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -15(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -1(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl1LoopStart):
> - movaps 15(%rcx), %xmm2
> - movaps 31(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 47(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 63(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $1, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $1, %xmm3, %xmm4
> - jnz L(Shl1Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave1)
> -# endif
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl1LoopStart)
> -
> -L(Shl1LoopExit):
> - movdqu -1(%rcx), %xmm1
> - mov $15, %rsi
> - movdqu %xmm1, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
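The fifteen Shl* blocks exist because source and destination can sit at
different offsets within their 16-byte lines: loads stay aligned on the
source side, and palignr, the one SSSE3 instruction that earns this
file its suffix, stitches each output vector out of two adjacent source
vectors. One step of the idiom with intrinsics (a sketch, not the
generated code):

  #include <tmmintrin.h>

  /* palignr $1: bytes 1..16 of the 32-byte concatenation hi:lo,
     i.e. the 16 output bytes for a source 1 byte past a 16-byte
     boundary.  */
  static inline __m128i
  merge_shift1 (__m128i hi, __m128i lo)
  {
    return _mm_alignr_epi8 (hi, lo, 1);
  }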
> - .p2align 4
> -L(Shl2):
> - movaps -2(%rcx), %xmm1
> - movaps 14(%rcx), %xmm2
> -L(Shl2Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 30(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -14(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -2(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl2LoopStart):
> - movaps 14(%rcx), %xmm2
> - movaps 30(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 46(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 62(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $2, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $2, %xmm3, %xmm4
> - jnz L(Shl2Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave2)
> -# endif
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl2LoopStart)
> -
> -L(Shl2LoopExit):
> - movdqu -2(%rcx), %xmm1
> - mov $14, %rsi
> - movdqu %xmm1, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl3):
> - movaps -3(%rcx), %xmm1
> - movaps 13(%rcx), %xmm2
> -L(Shl3Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 29(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -13(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -3(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl3LoopStart):
> - movaps 13(%rcx), %xmm2
> - movaps 29(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 45(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 61(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $3, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $3, %xmm3, %xmm4
> - jnz L(Shl3Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave3)
> -# endif
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl3LoopStart)
> -
> -L(Shl3LoopExit):
> - movdqu -3(%rcx), %xmm1
> - mov $13, %rsi
> - movdqu %xmm1, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl4):
> - movaps -4(%rcx), %xmm1
> - movaps 12(%rcx), %xmm2
> -L(Shl4Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 28(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -12(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -4(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl4LoopStart):
> - movaps 12(%rcx), %xmm2
> - movaps 28(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 44(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 60(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $4, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $4, %xmm3, %xmm4
> - jnz L(Shl4Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave4)
> -# endif
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl4LoopStart)
> -
> -L(Shl4LoopExit):
> - movdqu -4(%rcx), %xmm1
> - mov $12, %rsi
> - movdqu %xmm1, -4(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl5):
> - movaps -5(%rcx), %xmm1
> - movaps 11(%rcx), %xmm2
> -L(Shl5Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 27(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -11(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -5(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl5LoopStart):
> - movaps 11(%rcx), %xmm2
> - movaps 27(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 43(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 59(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $5, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $5, %xmm3, %xmm4
> - jnz L(Shl5Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave5)
> -# endif
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl5LoopStart)
> -
> -L(Shl5LoopExit):
> - movdqu -5(%rcx), %xmm1
> - mov $11, %rsi
> - movdqu %xmm1, -5(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl6):
> - movaps -6(%rcx), %xmm1
> - movaps 10(%rcx), %xmm2
> -L(Shl6Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 26(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -10(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -6(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl6LoopStart):
> - movaps 10(%rcx), %xmm2
> - movaps 26(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 42(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 58(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $6, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $6, %xmm3, %xmm4
> - jnz L(Shl6Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave6)
> -# endif
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl6LoopStart)
> -
> -L(Shl6LoopExit):
> - mov (%rcx), %r9
> - mov 6(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 6(%rdx)
> - mov $10, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
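The Shl*LoopExit stubs copy awkward byte counts with a pair of
overlapping stores rather than a byte loop; here 10 bytes go out as one
8-byte and one 4-byte move sharing bytes 6 and 7. The trick in portable
form (a sketch):

  #include <string.h>

  /* Copy exactly 10 bytes; the 2-byte overlap is harmless because
     both stores write the same source data there.  */
  static inline void
  copy10 (char *dst, const char *src)
  {
    memcpy (dst, src, 8);		/* bytes 0..7 */
    memcpy (dst + 6, src + 6, 4);	/* bytes 6..9 */
  }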
> - .p2align 4
> -L(Shl7):
> - movaps -7(%rcx), %xmm1
> - movaps 9(%rcx), %xmm2
> -L(Shl7Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 25(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -9(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -7(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl7LoopStart):
> - movaps 9(%rcx), %xmm2
> - movaps 25(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 41(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 57(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $7, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $7, %xmm3, %xmm4
> - jnz L(Shl7Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave7)
> -# endif
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl7LoopStart)
> -
> -L(Shl7LoopExit):
> - mov (%rcx), %r9
> - mov 5(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 5(%rdx)
> - mov $9, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl8):
> - movaps -8(%rcx), %xmm1
> - movaps 8(%rcx), %xmm2
> -L(Shl8Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 24(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -8(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -8(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl8LoopStart):
> - movaps 8(%rcx), %xmm2
> - movaps 24(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 40(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 56(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $8, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $8, %xmm3, %xmm4
> - jnz L(Shl8Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave8)
> -# endif
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl8LoopStart)
> -
> -L(Shl8LoopExit):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl9):
> - movaps -9(%rcx), %xmm1
> - movaps 7(%rcx), %xmm2
> -L(Shl9Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 23(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -7(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -9(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl9LoopStart):
> - movaps 7(%rcx), %xmm2
> - movaps 23(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 39(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 55(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $9, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $9, %xmm3, %xmm4
> - jnz L(Shl9Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave9)
> -# endif
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl9LoopStart)
> -
> -L(Shl9LoopExit):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl10):
> - movaps -10(%rcx), %xmm1
> - movaps 6(%rcx), %xmm2
> -L(Shl10Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 22(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -6(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -10(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl10LoopStart):
> - movaps 6(%rcx), %xmm2
> - movaps 22(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 38(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 54(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $10, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $10, %xmm3, %xmm4
> - jnz L(Shl10Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave10)
> -# endif
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl10LoopStart)
> -
> -L(Shl10LoopExit):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl11):
> - movaps -11(%rcx), %xmm1
> - movaps 5(%rcx), %xmm2
> -L(Shl11Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 21(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -5(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -11(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl11LoopStart):
> - movaps 5(%rcx), %xmm2
> - movaps 21(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 37(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 53(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $11, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $11, %xmm3, %xmm4
> - jnz L(Shl11Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave11)
> -# endif
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl11LoopStart)
> -
> -L(Shl11LoopExit):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl12):
> - movaps -12(%rcx), %xmm1
> - movaps 4(%rcx), %xmm2
> -L(Shl12Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 20(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -4(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -12(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl12LoopStart):
> - movaps 4(%rcx), %xmm2
> - movaps 20(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 36(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 52(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $12, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $12, %xmm3, %xmm4
> - jnz L(Shl12Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave12)
> -# endif
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl12LoopStart)
> -
> -L(Shl12LoopExit):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl13):
> - movaps -13(%rcx), %xmm1
> - movaps 3(%rcx), %xmm2
> -L(Shl13Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 19(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -3(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -13(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl13LoopStart):
> - movaps 3(%rcx), %xmm2
> - movaps 19(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 35(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 51(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $13, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $13, %xmm3, %xmm4
> - jnz L(Shl13Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave13)
> -# endif
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl13LoopStart)
> -
> -L(Shl13LoopExit):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl14):
> - movaps -14(%rcx), %xmm1
> - movaps 2(%rcx), %xmm2
> -L(Shl14Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 18(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -2(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -14(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl14LoopStart):
> - movaps 2(%rcx), %xmm2
> - movaps 18(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 34(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 50(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $14, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $14, %xmm3, %xmm4
> - jnz L(Shl14Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave14)
> -# endif
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl14LoopStart)
> -
> -L(Shl14LoopExit):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl15):
> - movaps -15(%rcx), %xmm1
> - movaps 1(%rcx), %xmm2
> -L(Shl15Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 17(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -1(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -15(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl15LoopStart):
> - movaps 1(%rcx), %xmm2
> - movaps 17(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 33(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 49(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $15, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $15, %xmm3, %xmm4
> - jnz L(Shl15Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave15)
> -# endif
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl15LoopStart)
> -
> -L(Shl15LoopExit):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> -# ifdef USE_AS_STRCAT
> - jmp L(CopyFrom1To16Bytes)
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> -# ifdef USE_AS_STRNCPY
> - add $16, %r8
> -# endif
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> -
> - .p2align 4
> -L(Exit8):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $8, %r8
> - lea 8(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> -
> - .p2align 4
> -L(Exit16):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %rax
> - mov %rax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - lea 16(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - cmp $1, %r8
> - je L(Exit1)
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - test $0x40, %al
> - jnz L(Exit7)
> - jmp L(Exit8)
> -
> - .p2align 4
> -L(ExitHighCase2):
> - cmp $9, %r8
> - je L(Exit9)
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $15, %r8
> - je L(Exit15)
> - test $0x40, %ah
> - jnz L(Exit15)
> - jmp L(Exit16)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $16, %r8
> - je L(Exit16)
> - cmp $8, %r8
> - je L(Exit8)
> - jg L(More8Case3)
> - cmp $4, %r8
> - je L(Exit4)
> - jg L(More4Case3)
> - cmp $2, %r8
> - jl L(Exit1)
> - je L(Exit2)
> - jg L(Exit3)
> -L(More8Case3): /* but less than 16 */
> - cmp $12, %r8
> - je L(Exit12)
> - jl L(Less12Case3)
> - cmp $14, %r8
> - jl L(Exit13)
> - je L(Exit14)
> - jg L(Exit15)
> -L(More4Case3): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Exit5)
> - je L(Exit6)
> - jg L(Exit7)
> -L(Less12Case3): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Exit9)
> - je L(Exit10)
> - jg L(Exit11)
> -# endif
> -
> - .p2align 4
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea (%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $1, %r8
> - lea 1(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $2, %r8
> - lea 2(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $3, %r8
> - lea 3(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $4, %r8
> - lea 4(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit5):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $5, %r8
> - lea 5(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $6, %r8
> - lea 6(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movl 3(%rcx), %eax
> - movl %eax, 3(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $7, %r8
> - lea 7(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit9):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %eax
> - mov %eax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $9, %r8
> - lea 9(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %eax
> - mov %eax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $10, %r8
> - lea 10(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $11, %r8
> - lea 11(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $12, %r8
> - lea 12(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit13):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %rax
> - mov %rax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $13, %r8
> - lea 13(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %rax
> - mov %rax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $14, %r8
> - lea 14(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $15, %r8
> - lea 15(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(Fill0):
> - ret
> -
> - .p2align 4
> -L(Fill1):
> - movb %dl, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill2):
> - movw %dx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill3):
> - movw %dx, (%rcx)
> - movb %dl, 2(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill4):
> - movl %edx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill5):
> - movl %edx, (%rcx)
> - movb %dl, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill6):
> - movl %edx, (%rcx)
> - movw %dx, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill7):
> - movl %edx, (%rcx)
> - movl %edx, 3(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill8):
> - mov %rdx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill9):
> - mov %rdx, (%rcx)
> - movb %dl, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill10):
> - mov %rdx, (%rcx)
> - movw %dx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill11):
> - mov %rdx, (%rcx)
> - movl %edx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill12):
> - mov %rdx, (%rcx)
> - movl %edx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill13):
> - mov %rdx, (%rcx)
> - mov %rdx, 5(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill14):
> - mov %rdx, (%rcx)
> - mov %rdx, 6(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill15):
> - mov %rdx, (%rcx)
> - mov %rdx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill16):
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(StrncpyFillExit1):
> - lea 16(%r8), %r8
> -L(FillFrom1To16Bytes):
> - test %r8, %r8
> - jz L(Fill0)
> - cmp $16, %r8
> - je L(Fill16)
> - cmp $8, %r8
> - je L(Fill8)
> - jg L(FillMore8)
> - cmp $4, %r8
> - je L(Fill4)
> - jg L(FillMore4)
> - cmp $2, %r8
> - jl L(Fill1)
> - je L(Fill2)
> - jg L(Fill3)
> -L(FillMore8): /* but less than 16 */
> - cmp $12, %r8
> - je L(Fill12)
> - jl L(FillLess12)
> - cmp $14, %r8
> - jl L(Fill13)
> - je L(Fill14)
> - jg L(Fill15)
> -L(FillMore4): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Fill5)
> - je L(Fill6)
> - jg L(Fill7)
> -L(FillLess12): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Fill9)
> - je L(Fill10)
> - jmp L(Fill11)
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero1):
> - xor %rdx, %rdx
> - sub $16, %r8
> - jbe L(StrncpyFillExit1)
> -
> - pxor %xmm0, %xmm0
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> -
> - lea 16(%rcx), %rcx
> -
> - mov %rcx, %rdx
> - and $0xf, %rdx
> - sub %rdx, %rcx
> - add %rdx, %r8
> - xor %rdx, %rdx
> - sub $64, %r8
> - jb L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - movdqa %xmm0, 32(%rcx)
> - movdqa %xmm0, 48(%rcx)
> - lea 64(%rcx), %rcx
> - sub $64, %r8
> - jae L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> - add $32, %r8
> - jl L(StrncpyFillLess32)
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - lea 32(%rcx), %rcx
> - sub $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> -L(StrncpyFillLess32):
> - add $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> - .p2align 4
> -L(Exit0):
> - mov %rdx, %rax
> - ret
> -
> - .p2align 4
> -L(StrncpyExit15Bytes):
> - cmp $9, %r8
> - je L(Exit9)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit8Bytes):
> - cmp $1, %r8
> - je L(Exit1)
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> -# endif
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(StrncpyLeaveCase2OrCase3):
> - test %rax, %rax
> - jnz L(Aligned64LeaveCase2)
> -
> -L(Aligned64LeaveCase3):
> - lea 64(%r8), %r8
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -L(Aligned64LeaveCase2):
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - add $48, %r8
> - jle L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase2)
> -/*--------------------------------------------------*/
> - .p2align 4
> -L(StrncpyExit1Case2OrCase3):
> - movdqu -1(%rcx), %xmm0
> - movdqu %xmm0, -1(%rdx)
> - mov $15, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit2Case2OrCase3):
> - movdqu -2(%rcx), %xmm0
> - movdqu %xmm0, -2(%rdx)
> - mov $14, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit3Case2OrCase3):
> - movdqu -3(%rcx), %xmm0
> - movdqu %xmm0, -3(%rdx)
> - mov $13, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit4Case2OrCase3):
> - movdqu -4(%rcx), %xmm0
> - movdqu %xmm0, -4(%rdx)
> - mov $12, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit5Case2OrCase3):
> - movdqu -5(%rcx), %xmm0
> - movdqu %xmm0, -5(%rdx)
> - mov $11, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit6Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 6(%rcx), %r9d
> - mov %r9d, 6(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $10, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit7Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 5(%rcx), %r9d
> - mov %r9d, 5(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $9, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit8Case2OrCase3):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit9Case2OrCase3):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit10Case2OrCase3):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit11Case2OrCase3):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit12Case2OrCase3):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit13Case2OrCase3):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit14Case2OrCase3):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit15Case2OrCase3):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave1):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit1)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit1):
> - lea 15(%rdx, %rsi), %rdx
> - lea 15(%rcx, %rsi), %rcx
> - mov -15(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -15(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave2):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit2)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit2):
> - lea 14(%rdx, %rsi), %rdx
> - lea 14(%rcx, %rsi), %rcx
> - mov -14(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -14(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave3):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit3)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit3):
> - lea 13(%rdx, %rsi), %rdx
> - lea 13(%rcx, %rsi), %rcx
> - mov -13(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -13(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave4):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit4)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit4):
> - lea 12(%rdx, %rsi), %rdx
> - lea 12(%rcx, %rsi), %rcx
> - mov -12(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -12(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave5):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit5)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit5):
> - lea 11(%rdx, %rsi), %rdx
> - lea 11(%rcx, %rsi), %rcx
> - mov -11(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -11(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave6):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit6)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit6):
> - lea 10(%rdx, %rsi), %rdx
> - lea 10(%rcx, %rsi), %rcx
> - mov -10(%rcx), %rsi
> - movw -2(%rcx), %ax
> - mov %rsi, -10(%rdx)
> - movw %ax, -2(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave7):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit7)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit7):
> - lea 9(%rdx, %rsi), %rdx
> - lea 9(%rcx, %rsi), %rcx
> - mov -9(%rcx), %rsi
> - movb -1(%rcx), %ah
> - mov %rsi, -9(%rdx)
> - movb %ah, -1(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave8):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit8)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit8):
> - lea 8(%rdx, %rsi), %rdx
> - lea 8(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave9):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit9)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit9):
> - lea 7(%rdx, %rsi), %rdx
> - lea 7(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave10):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit10)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit10):
> - lea 6(%rdx, %rsi), %rdx
> - lea 6(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave11):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit11)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit11):
> - lea 5(%rdx, %rsi), %rdx
> - lea 5(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave12):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit12)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit12):
> - lea 4(%rdx, %rsi), %rdx
> - lea 4(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave13):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit13)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit13):
> - lea 3(%rdx, %rsi), %rdx
> - lea 3(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave14):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit14)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit14):
> - lea 2(%rdx, %rsi), %rdx
> - lea 2(%rcx, %rsi), %rcx
> - movw -2(%rcx), %ax
> - xor %rsi, %rsi
> - movw %ax, -2(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave15):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit15)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit15):
> - lea 1(%rdx, %rsi), %rdx
> - lea 1(%rcx, %rsi), %rcx
> - movb -1(%rcx), %ah
> - xor %rsi, %rsi
> - movb %ah, -1(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -# endif
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> deleted file mode 100644
> index bf82ee447d..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_ssse3
> -#include "strcpy-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (5 preceding siblings ...)
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
@ 2022-03-25 20:34 ` Andreas Schwab
2022-03-25 20:40 ` Noah Goldstein
6 siblings, 1 reply; 49+ messages in thread
From: Andreas Schwab @ 2022-03-25 20:34 UTC (permalink / raw)
To: Noah Goldstein via Libc-alpha
On Mar 25 2022, Noah Goldstein via Libc-alpha wrote:
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result its no longer with the code size cost.
I think the second sentence is missing something. Also: s/its/it is/.
--
Andreas Schwab, schwab@linux-m68k.org
GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
"And now for something completely different."
* Re: [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 20:34 ` Andreas Schwab
@ 2022-03-25 20:40 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:40 UTC (permalink / raw)
To: Andreas Schwab; +Cc: Noah Goldstein via Libc-alpha
On Fri, Mar 25, 2022 at 3:34 PM Andreas Schwab <schwab@linux-m68k.org> wrote:
>
> On Mar 25 2022, Noah Goldstein via Libc-alpha wrote:
>
> > With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> > SSSE3. As a result its no longer with the code size cost.
>
> I think the second sentence is missing something. Also: s/its/it is/.
How's:
"As a result it is no longer worth it to keep the SSSE3 versions given
the code size cost."
>
> --
> Andreas Schwab, schwab@linux-m68k.org
> GPG Key fingerprint = 7578 EB47 D4E5 4D69 2510 2552 DF73 E780 A9DA AEC1
> "And now for something completely different."
* [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
@ 2022-03-25 20:44 ` Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (4 more replies)
2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
` (7 subsequent siblings)
9 siblings, 5 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
---
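As a quick illustration of the dispatch order this change leaves behind,
the standalone sketch below models the ifunc selection using GCC's
__builtin_cpu_supports in place of glibc's internal
CPU_FEATURE_USABLE_P/OPTIMIZE machinery; the authoritative logic is the
ifunc-memcmp.h hunk further down, and pick_memcmp is a made-up helper
name used only for this illustration. The practical effect is that a CPU
with SSSE3 but without SSE4.1 now falls straight through to
__memcmp_sse2.

#include <stdio.h>

/* Illustrative model of the post-patch memcmp ifunc selection; not
   glibc code.  */
static const char *
pick_memcmp (void)
{
  __builtin_cpu_init ();
  /* The real selector prefers the EVEX variant when AVX-512 is
     usable; both AVX branches are folded into one check here for
     brevity.  */
  if (__builtin_cpu_supports ("avx2"))
    return "__memcmp_avx2_movbe";
  if (__builtin_cpu_supports ("sse4.1"))
    return "__memcmp_sse4_1";
  /* Before this patch, an SSSE3-capable CPU without SSE4.1 selected
     __memcmp_ssse3 at this point.  */
  return "__memcmp_sse2";
}

int
main (void)
{
  printf ("memcmp implementation: %s\n", pick_memcmp ());
  return 0;
}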
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
5 files changed, 2006 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
memcmp-evex-movbe \
memcmp-sse2 \
memcmp-sse4 \
- memcmp-ssse3 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
wmemcmp-c \
wmemcmp-evex-movbe \
wmemcmp-sse4 \
- wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
- __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
- __wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
- test %RDX_LP, %RDX_LP
- jz L(equal)
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx;
- jae L(48bytesormore) /* LEN => 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* ECX >= 32. */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get there only if we already know there is a
-difference. */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
--
2.25.1
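For readers tracing the deleted tail of memcmp-ssse3.S above: the
L(exit), L(less16bytes), and L(find_diff) paths implement the usual
memcmp/wmemcmp result conventions once pcmpeqb/pmovmskb have located a
mismatch. A minimal C sketch of what they compute (illustrative only,
not glibc code; the helper names are made up):

    #include <stddef.h>

    /* memcmp: the Byte16..Byte22 stubs return the difference of the
       first mismatching bytes taken as unsigned chars.  */
    static int
    memcmp_result (const unsigned char *s1, const unsigned char *s2,
                   size_t i)        /* i: index of first mismatch */
    {
      return s1[i] - s2[i];
    }

    /* wmemcmp: elements are 4-byte wchar_t values compared as signed
       ints, so L(find_diff) returns only +1 or -1 (note the signed
       "jg") rather than a byte difference.  */
    static int
    wmemcmp_result (const int *s1, const int *s2, size_t i)
    {
      return s1[i] > s2[i] ? 1 : -1;
    }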
* [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-03-25 20:44 ` Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
` (3 subsequent siblings)
4 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth keeping the SSSE3
versions given the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
sysdeps/x86_64/multiarch/strcmp.c | 4 -
sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
sysdeps/x86_64/multiarch/strncmp.c | 4 -
sysdeps/x86_64/strcmp.S | 155 ++++--------------
10 files changed, 30 insertions(+), 202 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 51222dfab1..ed2def288d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -58,7 +58,6 @@ sysdep_routines += \
strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
- strcasecmp_l-ssse3 \
strcat-avx2 \
strcat-avx2-rtm \
strcat-evex \
@@ -80,7 +79,6 @@ sysdep_routines += \
strcmp-sse2 \
strcmp-sse2-unaligned \
strcmp-sse4_2 \
- strcmp-ssse3 \
strcpy-avx2 \
strcpy-avx2-rtm \
strcpy-evex \
@@ -98,7 +96,6 @@ sysdep_routines += \
strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
- strncase_l-ssse3 \
strncat-avx2 \
strncat-avx2-rtm \
strncat-c \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncmp-evex \
strncmp-sse2 \
strncmp-sse4_2 \
- strncmp-ssse3 \
strncpy-avx2 \
strncpy-avx2-rtm \
strncpy-c \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f389928a4e..7e2be3554b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
@@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
__strcasecmp_l_sse2))
@@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strcmp_evex)
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
__strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
- __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
@@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
__strncasecmp_sse2))
@@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
__strncasecmp_l_sse2))
@@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncmp_evex)
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
__strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
- __strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 766539c241..296d32071b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -20,7 +20,6 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baad..a248c2a6e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -28,7 +28,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index ec37308347..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcmp optimized with SSSE3.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define STRCMP __strncmp_ssse3
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strcmp)
-
-#define USE_SSSE3 1
-#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index fca74199d8..70ae6547c9 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,7 +27,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 99d8b36f1d..c38dc627f9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -59,12 +59,7 @@
# endif
#endif
-#ifndef USE_SSSE3
.text
-#else
- .section .text.ssse3,"ax",@progbits
-#endif
-
#ifdef USE_AS_STRCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
@@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
--
2.25.1
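A note on the strcmp.S hunks above: the patch keeps the two-shift-plus-OR
SSE2 sequence and drops the SSSE3 palignr alternative; both compute the
same merged 16-byte value, SSSE3 just does it in one instruction. A
rough intrinsics sketch of the equivalence for the shift-by-1 case
(illustrative only; the helper names are made up):

    #include <emmintrin.h>    /* SSE2: _mm_srli_si128, _mm_slli_si128 */
    #include <tmmintrin.h>    /* SSSE3: _mm_alignr_epi8 */

    /* SSE2 path kept by the patch: shift the low vector right one byte,
       the high vector left fifteen bytes, and OR the halves.  */
    static __m128i
    merge_sse2 (__m128i lo, __m128i hi)
    {
      return _mm_or_si128 (_mm_srli_si128 (lo, 1),    /* psrldq $1  */
                           _mm_slli_si128 (hi, 15));  /* pslldq $15 */
    }

    /* SSSE3 path removed by the patch: one palignr extracts the same
       16-byte window from the concatenation hi:lo.  */
    static __m128i
    merge_ssse3 (__m128i lo, __m128i hi)
    {
      return _mm_alignr_epi8 (hi, lo, 1);             /* palignr $1 */
    }

For a general byte shift N the pattern is srli(lo, N) | slli(hi, 16 - N),
which is why each gobble_ashr_N block pairs psrldq $N with pslldq $(16-N).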
* [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-03-25 20:44 ` Noah Goldstein
2022-04-10 0:57 ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
` (2 subsequent siblings)
4 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth keeping the SSSE3
versions given the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 18 +-
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 --------------------
sysdeps/x86_64/multiarch/memmove-ssse3.S | 4 -
5 files changed, 7 insertions(+), 3183 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ed2def288d..48f81711ae 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
@@ -24,7 +23,6 @@ sysdep_routines += \
memmove-avx512-unaligned-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
- memmove-ssse3 \
memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e2be3554b..70b0e9c62e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -135,9 +135,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -179,8 +176,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned)
@@ -887,9 +882,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -922,8 +914,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
@@ -973,9 +963,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3_back)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1017,8 +1004,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3_back)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..1ecdd4b0d3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -24,8 +24,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +92,15 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
+ return OPTIMIZE (ssse3_back);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
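The ifunc-memmove.h hunk above inverts the SSSE3 test rather than
deleting it outright: __memmove_ssse3 is gone, but __memmove_ssse3_back
is still chosen on SSSE3 machines without fast unaligned copy and with
fast backward copy; everything else now falls through to the SSE2
variants. A sketch of the resulting fallback order (illustrative only;
the flag parameters stand in for the CPU_FEATURE_USABLE_P /
CPU_FEATURES_ARCH_P checks in the real selector):

    typedef void *(*memmove_fn) (void *, const void *, unsigned long);

    extern memmove_fn ssse3_back, sse2_unaligned_erms, sse2_unaligned;

    static memmove_fn
    select_memmove (int ssse3, int fast_unaligned_copy,
                    int fast_copy_backward, int erms)
    {
      if (ssse3 && !fast_unaligned_copy && fast_copy_backward)
        return ssse3_back;           /* the only SSSE3 variant kept */
      if (erms)
        return sse2_unaligned_erms;
      return sse2_unaligned;
    }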
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
deleted file mode 100644
index 295430b1ef..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
--
2.25.1
* [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
@ 2022-03-25 20:44 ` Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
4 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result it is no longer worth keeping the SSSE3 versions
given their code size cost.
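
For readers less familiar with the ifunc machinery, the practical effect
is easiest to see in the ifunc-memmove.h hunk below: the deleted block
was the only path by which a pre-AVX CPU could avoid the SSE2
implementations. Below is a minimal standalone model of the dispatch
change; the cpu struct and pick_memmove() are illustrative stand-ins,
not glibc's real init-arch.h interface, and the AVX/EVEX branches that
precede this point in the real selector are omitted.

/* Hypothetical sketch of the memmove ifunc fall-through touched by
   this patch; not glibc's actual selector.  */
#include <stdio.h>

struct cpu
{
  int ssse3;                /* CPU_FEATURE_USABLE (SSSE3)  */
  int fast_unaligned_copy;  /* Fast_Unaligned_Copy arch bit  */
  int fast_copy_backward;   /* Fast_Copy_Backward arch bit  */
  int erms;                 /* CPU_FEATURE_USABLE (ERMS)  */
};

static const char *
pick_memmove (const struct cpu *c, int before_patch)
{
  /* The branch this patch deletes: only taken on SSSE3 CPUs whose
     unaligned copies are slow but whose backward copies are fast.  */
  if (before_patch
      && c->ssse3 && !c->fast_unaligned_copy && c->fast_copy_backward)
    return "__memmove_ssse3_back";

  /* Afterwards such CPUs fall through to the SSE2 baseline; ERMS
     merely selects the rep-movsb flavored variant.  */
  return c->erms ? "__memmove_sse2_unaligned_erms"
                 : "__memmove_sse2_unaligned";
}

int
main (void)
{
  struct cpu ssse3_era = { 1, 0, 1, 0 };  /* hypothetical SSSE3 target  */
  printf ("before: %s\n", pick_memmove (&ssse3_era, 1));
  printf ("after:  %s\n", pick_memmove (&ssse3_era, 0));
  return 0;
}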
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 7 -
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
5 files changed, 3209 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 48f81711ae..323be3b969 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,14 +16,12 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 70b0e9c62e..d6852ab365 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
__memmove_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk, 1,
@@ -174,8 +171,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
IFUNC_IMPL_ADD (array, i, memmove, 1,
__memmove_sse2_unaligned)
@@ -879,9 +874,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
__memcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk, 1,
@@ -912,8 +904,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
@@ -960,9 +950,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
__mempcpy_chk_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk, 1,
@@ -1002,8 +989,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
__mempcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 1ecdd4b0d3..5596ddea2c 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -92,13 +92,6 @@ IFUNC_SELECTOR (void)
}
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
- {
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
- }
-
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (sse2_unaligned_erms);
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
-   relative offsets. INDEX is a register containing the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
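
The macro above implements a position-independent jump table: the table
stores 32-bit offsets relative to its own base rather than absolute
addresses, so it needs no load-time relocations. A rough C analogue
using GCC's labels-as-values extension, with invented handler names:

  #include <stdio.h>

  static void
  dispatch (unsigned n)
  {
    /* Offsets are relative to handle0, not absolute addresses.  */
    static const int offsets[] = {
      &&handle0 - &&handle0,
      &&handle1 - &&handle0,
      &&handle2 - &&handle0,
    };
    goto *(&&handle0 + offsets[n]);

   handle0: puts ("copy 0 bytes"); return;
   handle1: puts ("copy 1 byte"); return;
   handle2: puts ("copy 2 bytes"); return;
  }

  int
  main (void)
  {
    dispatch (2);
    return 0;
  }

The assembly performs the same arithmetic: load the offset with movslq,
add the table base held in %r11, and jump indirect.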
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
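
Each of the sixteen L(shl_N) loops, like L(shl_1) above, exists because
palignr encodes its byte shift as an instruction immediate, so the
shift cannot be a runtime variable. A hedged intrinsics sketch of how
one 16-byte chunk is produced from two aligned loads (function name
invented; compile with -mssse3):

  #include <tmmintrin.h>
  #include <stdint.h>

  #define SHIFT 1  /* must be a compile-time constant */

  static void
  copy16_at_shift (uint8_t *dst, const uint8_t *src_aligned)
  {
    __m128i lo = _mm_load_si128 ((const __m128i *) src_aligned);
    __m128i hi = _mm_load_si128 ((const __m128i *) (src_aligned + 16));
    /* Bytes SHIFT..15 of lo followed by bytes 0..SHIFT-1 of hi, i.e.
       the 16 bytes starting at src_aligned + SHIFT.  */
    __m128i v = _mm_alignr_epi8 (hi, lo, SHIFT);
    _mm_storeu_si128 ((__m128i *) dst, v);
  }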
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
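
The large-copy path above streams through memory with movntdq so that a
copy bigger than the cache does not evict the cache's entire contents,
then fences before returning to ordinary stores. The same idea as a
minimal intrinsics sketch, assuming a 16-byte-aligned destination and a
length that is a multiple of 16 (compile with -msse2):

  #include <emmintrin.h>
  #include <stddef.h>
  #include <stdint.h>

  static void
  stream_copy (uint8_t *dst, const uint8_t *src, size_t n)
  {
    for (size_t i = 0; i < n; i += 16)
      {
        __m128i v = _mm_loadu_si128 ((const __m128i *) (src + i));
        _mm_stream_si128 ((__m128i *) (dst + i), v);  /* movntdq */
      }
    _mm_sfence ();  /* order NT stores before later ordinary stores */
  }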
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
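
The tail entries above rely on overlapping moves: a 17..31 byte
remainder is written as two unaligned 16-byte moves, one anchored at
each end, so the middle bytes are stored twice instead of being copied
in a byte loop. A hedged C equivalent (function name invented):

  #include <emmintrin.h>
  #include <stddef.h>
  #include <stdint.h>

  /* Valid for 16 <= n <= 32.  */
  static void
  copy_tail (uint8_t *dst, const uint8_t *src, size_t n)
  {
    __m128i head = _mm_loadu_si128 ((const __m128i *) src);
    __m128i tail = _mm_loadu_si128 ((const __m128i *) (src + n - 16));
    _mm_storeu_si128 ((__m128i *) dst, head);
    _mm_storeu_si128 ((__m128i *) (dst + n - 16), tail);
  }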
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
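The tables above drive the dispatch for short copies in the deleted
memcpy: each JMPTBL (ENTRY, TABLE) entry assembles to the 32-bit offset
ENTRY - TABLE, so the .rodata tables stay position independent. The
consumer is a BRANCH_TO_JMPTBL_ENTRY macro defined earlier in the
deleted file (not visible in this excerpt); its expansion is roughly
the following sketch, with the index and scratch registers chosen
purely for illustration:

	lea	L(table_144_bytes_fwd)(%rip), %r11	/* table base */
	movslq	(%r11, %rdx, 4), %rcx	/* load and sign-extend the
					   32-bit offset for count %rdx */
	lea	(%r11, %rcx), %rcx	/* rebase to an absolute address */
	jmp	*%rcx			/* tail-jump to L(fwd_write_<n>bytes) */
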
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
--
2.25.1
* [PATCH v2 5/6] x86: Remove str{n}cat-ssse3
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (2 preceding siblings ...)
2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-03-25 20:44 ` Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
4 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few
targets prefer SSSE3. As a result, it is no longer worth keeping the
SSSE3 versions given their code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
5 files changed, 879 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 323be3b969..a2ebc06c5f 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -59,7 +59,6 @@ sysdep_routines += \
strcat-evex \
strcat-sse2 \
strcat-sse2-unaligned \
- strcat-ssse3 \
strchr-avx2 \
strchr-avx2-rtm \
strchr-evex \
@@ -97,7 +96,6 @@ sysdep_routines += \
strncat-c \
strncat-evex \
strncat-sse2-unaligned \
- strncat-ssse3 \
strncmp-avx2 \
strncmp-avx2-rtm \
strncmp-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d6852ab365..4133ed7e43 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -471,8 +471,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcat_evex)
- IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
- __strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
@@ -620,8 +618,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncat_evex)
- IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
- __strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 5bece38f78..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -23,7 +23,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
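After this hunk the strcpy-family selector falls straight through from
the unaligned-SSE2 variant to plain SSE2. Condensed into one piece (a
sketch that assumes glibc's usual ifunc boilerplate around it, with
the AVX2/EVEX checks, which this patch leaves unchanged, elided):

	static inline void *
	IFUNC_SELECTOR (void)
	{
	  const struct cpu_features *cpu_features = __get_cpu_features ();

	  /* ... AVX2/EVEX checks, unchanged by this patch, elided ... */

	  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
	    return OPTIMIZE (sse2_unaligned);

	  return OPTIMIZE (sse2);
	}
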
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index 9f39e4fcd1..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,866 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
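Much of the file deleted above is an inlined strlen: pcmpeqb compares
16 bytes against zero at once and pmovmskb condenses the result into a
bit mask whose lowest set bit marks the terminator. The same scan in C
intrinsics, as a minimal sketch (the function name is illustrative, and
this version aligns down and masks instead of using the byte-by-byte
prologue the deleted code had):

	#include <emmintrin.h>
	#include <stdint.h>
	#include <stddef.h>

	static size_t
	sse2_strlen (const char *s)
	{
	  const __m128i zero = _mm_setzero_si128 ();
	  /* Align down: the 16-byte block containing s never crosses a
	     page boundary before s, so the over-read cannot fault.  */
	  const char *p = (const char *) ((uintptr_t) s & ~(uintptr_t) 15);
	  unsigned int mask =
	    _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
					       zero));
	  mask >>= s - p;		/* discard bytes before s */
	  if (mask != 0)
	    return __builtin_ctz (mask);	/* first zero byte in block */
	  for (;;)
	    {
	      p += 16;
	      mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) p),
							zero));
	      if (mask != 0)
	        return (size_t) (p - s) + __builtin_ctz (mask);
	    }
	}
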
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
--
2.25.1
* [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (3 preceding siblings ...)
2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-03-25 20:44 ` Noah Goldstein
4 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-03-25 20:44 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few
targets prefer SSSE3. As a result, it is no longer worth keeping the
SSSE3 versions given their code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
6 files changed, 3572 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index a2ebc06c5f..292353bad7 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -42,13 +42,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -79,7 +77,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -106,7 +103,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 4133ed7e43..505b8002e0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -389,8 +389,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -407,8 +405,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -557,8 +553,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -634,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
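The L(Aligned64Loop) above tests a whole 64-byte chunk with a single
compare: pminub folds the four 16-byte blocks into their unsigned
byte-wise minimum, and that minimum contains a zero byte exactly when
one of the inputs does.  Sketch of the reduction in C intrinsics
(helper name illustrative):

    #include <emmintrin.h>

    /* Nonzero iff any of the 64 bytes at the 16-byte-aligned P is 0;
       one pcmpeqb/pmovmskb then checks the whole chunk.  */
    static int
    has_zero_in_64 (const char *p)
    {
      __m128i a = _mm_load_si128 ((const __m128i *) p);
      __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
      __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
      __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
      __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b),
                                _mm_min_epu8 (c, d));
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
    }
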
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
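L(Shl1) above, and the fourteen nearly identical L(ShlN) blocks that
follow, are one template instantiated per relative misalignment: the
source is read with aligned loads and two consecutive blocks are
stitched together with palignr, so both loads and stores stay
aligned, presumably because unaligned access was costly on the
SSSE3-era CPUs this file served.  One step of the Shl1 case, sketched
with intrinsics (names made up):

    #include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (palignr) */

    /* Copy one 16-byte block whose source is misaligned by 1 byte.
       PREV is the previous aligned source block; SRC and DST are
       16-byte aligned.  palignr takes bytes 1..16 of the PREV:CUR
       concatenation, rebuilding the unaligned source window.
       Returns CUR, which becomes the caller's PREV for the next
       step.  */
    static __m128i
    shl1_step (__m128i prev, const char *src, char *dst)
    {
      __m128i cur = _mm_load_si128 ((const __m128i *) src);
      _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (cur, prev, 1));
      return cur;
    }
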
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64-byte loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
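The test/jnz ladder above (and its twin under L(ExitHigh)) is a
hand-unrolled first-set-bit search over the pmovmskb mask: bit i set
means the NUL sits at offset i of the block, and the matching exit
stub then copies exactly i+1 bytes.  The same computation with a bit
scan, for reference (illustrative):

    /* Offset of the first NUL in the block, given a nonzero
       pmovmskb MASK.  */
    static unsigned int
    first_nul_offset (unsigned int mask)
    {
      return __builtin_ctz (mask);  /* e.g. mask 0x0010 -> offset 4 */
    }
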
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
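The USE_AS_STPCPY tail of each exit stub uses a small branchless
fixup: cmpb $1, (%rax) sets the carry flag exactly when the byte at
rax is zero, and sbb $-1, %rax computes rax + 1 - CF.  The return
value therefore points at the last byte written when it is the
terminating NUL, and one past it when the count ran out first,
matching stpncpy semantics.  In C (illustrative):

    /* LAST points at the final byte written; return it if it is the
       NUL, else one past it.  */
    static char *
    stpncpy_ret (char *last)
    {
      return last + (*last != '\0');
    }
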
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
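In the Case2/Case3 naming used throughout this tail, case 2 means the
pmovmskb mask in rax found a NUL inside the final block, so the NUL
position must still be compared against the remaining count in r8,
while case 3 means only the count cuts the copy short.  The decision,
modeled roughly in C (illustrative):

    #include <stddef.h>

    /* Bytes of the final block to copy: MASK is the pmovmskb result,
       BOUND the remaining count (<= 16).  */
    static size_t
    final_copy_len (unsigned int mask, size_t bound)
    {
      if (mask != 0)  /* case 2: NUL present in the block */
        {
          size_t nul = (size_t) __builtin_ctz (mask);
          return nul + 1 < bound ? nul + 1 : bound; /* include the NUL */
        }
      return bound;   /* case 3: the count runs out first */
    }
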
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
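The L(Fill0)..L(Fill16) stubs above pad the strncpy destination tail
with an exact count of zero bytes using the widest stores that fit;
note the overlapping forms, e.g. Fill7's two 4-byte stores and
Fill13's two 8-byte stores.  The same effect written compactly in C
(illustrative; the original reaches the exact-size stub through the
compare ladder below instead):

    #include <stdint.h>
    #include <string.h>

    /* Zero N <= 16 bytes at DST with at most two overlapping stores.  */
    static void
    fill_upto16 (char *dst, size_t n)
    {
      uint64_t z = 0;
      if (n >= 8)
        {
          memcpy (dst, &z, 8);
          memcpy (dst + n - 8, &z, 8);  /* overlaps when n < 16 */
        }
      else if (n >= 4)
        {
          memcpy (dst, &z, 4);
          memcpy (dst + n - 4, &z, 4);
        }
      else if (n > 0)
        {
          dst[0] = 0;
          dst[n / 2] = 0;
          dst[n - 1] = 0;
        }
    }
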
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
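L(StrncpyFillTailWithZero1) above zeroes the tail in bulk: 16 bytes
with two 8-byte stores first, then the pointer is rounded down to a
16-byte boundary and full 64-byte blocks are cleared with aligned
movdqa stores, with the sub-64-byte remainder handed back to the
sized fill stubs.  The core loop, sketched with intrinsics
(illustrative; head and remainder handling omitted):

    #include <stddef.h>
    #include <emmintrin.h>

    /* Clear the 64-byte blocks of the tail; P must be 16-byte
       aligned here.  */
    static void
    zero_64byte_blocks (char *p, size_t n)
    {
      __m128i z = _mm_setzero_si128 ();
      for (; n >= 64; p += 64, n -= 64)
        {
          _mm_store_si128 ((__m128i *) p, z);        /* movdqa */
          _mm_store_si128 ((__m128i *) (p + 16), z);
          _mm_store_si128 ((__m128i *) (p + 32), z);
          _mm_store_si128 ((__m128i *) (p + 48), z);
        }
    }
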
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
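For reference, each L(StrncpyLeaveN) block in the file deleted above
handles a source that is N bytes past a 16-byte boundary: it performs
only aligned 16-byte loads and recombines adjacent vectors with
"palignr $N" before storing, and the matching L(StrncpyExitN) blocks
finish the last few bytes with a pair of overlapping scalar
loads/stores. Below is a minimal C sketch of that realignment idea
for a fixed shift of 2 (a hypothetical illustration only: the
function name is invented, and the NUL-terminator scanning and tail
handling of the real string code are omitted). Compile with -mssse3.

#include <stddef.h>
#include <stdint.h>
#include <tmmintrin.h>  /* SSSE3: _mm_alignr_epi8 (palignr) */

/* Hypothetical sketch: copy BLOCKS 16-byte chunks from SRC (which is
   2 bytes past a 16-byte boundary) to 16-byte-aligned DST using only
   aligned loads.  */
static void
copy_shift2 (uint8_t *dst, const uint8_t *src, size_t blocks)
{
  /* Aligned address at or below SRC; src == base + 2 bytes.  */
  const __m128i *base = (const __m128i *) ((uintptr_t) src & ~(uintptr_t) 15);
  __m128i prev = _mm_load_si128 (base);   /* covers src-2 .. src+13 */
  for (size_t i = 0; i < blocks; i++)
    {
      __m128i next = _mm_load_si128 (base + 1 + i);
      /* Bytes 2..17 of the concatenation next:prev, i.e. the 16
         source bytes starting at src + 16*i; this is what
         "palignr $2, %xmm1, %xmm2" computes above.  */
      __m128i out = _mm_alignr_epi8 (next, prev, 2);
      _mm_store_si128 ((__m128i *) dst + i, out);
      prev = next;
    }
}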
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1
* [PATCH v3 1/6] x86: Remove str{p}{n}cpy-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
` (6 subsequent siblings)
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few
targets prefer SSSE3. As a result, it is no longer worth it to keep
the SSSE3 versions given the code size cost.
---
Full memcpy ssse3 results. Numbers are the ratio of geometric means
of N=50 runs on a Zhaoxin KX-6840 @ 2000MHz (New Time / Old Time, so
values below 1.0 favor the new code).
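As a rough sketch of how such a ratio is computed (a hypothetical
post-processing example, not part of bench-memcpy itself; the helper
name and placeholder timings are invented for illustration; compile
with -lm):

#include <math.h>
#include <stdio.h>

/* Hypothetical sketch: reduce the per-run timings for one
   (length, align1, align2, dst > src) configuration to the
   "New Time / Old Time" ratio reported in the table below.  */
static double
geomean (const double *t, int n)
{
  double log_sum = 0.0;
  for (int i = 0; i < n; i++)
    log_sum += log (t[i]);
  return exp (log_sum / n);
}

int
main (void)
{
  /* Placeholder timings; real values come from benchmark runs.  */
  double new_runs[50], old_runs[50];
  for (int i = 0; i < 50; i++)
    {
      new_runs[i] = 95.0;   /* e.g. cycles for the new SSE2/SSE4.1 code */
      old_runs[i] = 100.0;  /* e.g. cycles for the removed SSSE3 code */
    }
  printf ("New Time / Old Time: %.3f\n",
          geomean (new_runs, 50) / geomean (old_runs, 50));
  return 0;
}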
bench-memcpy:
length, align1, align2, dst > src, New Time / Old Time
1, 0, 0, 0, 2.099
1, 0, 0, 1, 2.099
1, 32, 0, 0, 2.103
1, 32, 0, 1, 2.103
1, 0, 32, 0, 2.099
1, 0, 32, 1, 2.098
1, 32, 32, 0, 2.098
1, 32, 32, 1, 2.098
1, 2048, 0, 0, 2.098
1, 2048, 0, 1, 2.098
2, 0, 0, 0, 1.135
2, 0, 0, 1, 1.136
2, 1, 0, 0, 1.139
2, 1, 0, 1, 1.139
2, 33, 0, 0, 1.165
2, 33, 0, 1, 1.139
2, 0, 1, 0, 1.136
2, 0, 1, 1, 1.136
2, 0, 33, 0, 1.136
2, 0, 33, 1, 1.136
2, 1, 1, 0, 1.136
2, 1, 1, 1, 1.136
2, 33, 33, 0, 1.136
2, 33, 33, 1, 1.136
2, 2048, 0, 0, 1.136
2, 2048, 0, 1, 1.136
2, 2049, 0, 0, 1.191
2, 2049, 0, 1, 1.139
2, 2048, 1, 0, 1.136
2, 2048, 1, 1, 1.136
2, 2049, 1, 0, 1.136
2, 2049, 1, 1, 1.136
4, 0, 0, 0, 1.074
4, 0, 0, 1, 0.962
4, 2, 0, 0, 0.973
4, 2, 0, 1, 0.989
4, 34, 0, 0, 0.991
4, 34, 0, 1, 0.991
4, 0, 2, 0, 0.962
4, 0, 2, 1, 0.962
4, 0, 34, 0, 0.962
4, 0, 34, 1, 0.962
4, 2, 2, 0, 0.962
4, 2, 2, 1, 0.962
4, 34, 34, 0, 0.962
4, 34, 34, 1, 0.962
4, 2048, 0, 0, 0.962
4, 2048, 0, 1, 0.962
4, 2050, 0, 0, 0.977
4, 2050, 0, 1, 0.979
4, 2048, 2, 0, 0.962
4, 2048, 2, 1, 0.962
4, 2050, 2, 0, 0.962
4, 2050, 2, 1, 0.962
8, 0, 0, 0, 0.961
8, 0, 0, 1, 0.962
8, 3, 0, 0, 1.0
8, 3, 0, 1, 1.0
8, 35, 0, 0, 1.0
8, 35, 0, 1, 1.0
8, 0, 3, 0, 0.962
8, 0, 3, 1, 0.962
8, 0, 35, 0, 0.962
8, 0, 35, 1, 0.962
8, 3, 3, 0, 0.962
8, 3, 3, 1, 0.962
8, 35, 35, 0, 0.962
8, 35, 35, 1, 0.962
8, 2048, 0, 0, 0.962
8, 2048, 0, 1, 0.962
8, 2051, 0, 0, 1.0
8, 2051, 0, 1, 1.0
8, 2048, 3, 0, 0.962
8, 2048, 3, 1, 0.962
8, 2051, 3, 0, 0.962
8, 2051, 3, 1, 0.962
16, 0, 0, 0, 0.798
16, 0, 0, 1, 0.799
16, 4, 0, 0, 0.8
16, 4, 0, 1, 0.801
16, 36, 0, 0, 0.801
16, 36, 0, 1, 0.8
16, 0, 4, 0, 0.798
16, 0, 4, 1, 0.798
16, 0, 36, 0, 0.798
16, 0, 36, 1, 0.798
16, 4, 4, 0, 0.798
16, 4, 4, 1, 0.798
16, 36, 36, 0, 0.798
16, 36, 36, 1, 0.798
16, 2048, 0, 0, 0.798
16, 2048, 0, 1, 0.799
16, 2052, 0, 0, 0.8
16, 2052, 0, 1, 0.8
16, 2048, 4, 0, 0.798
16, 2048, 4, 1, 0.798
16, 2052, 4, 0, 0.798
16, 2052, 4, 1, 0.798
32, 0, 0, 0, 0.471
32, 0, 0, 1, 0.471
32, 5, 0, 0, 0.471
32, 5, 0, 1, 0.471
32, 37, 0, 0, 0.961
32, 37, 0, 1, 0.961
32, 0, 5, 0, 0.471
32, 0, 5, 1, 0.471
32, 0, 37, 0, 1.021
32, 0, 37, 1, 1.021
32, 5, 5, 0, 0.471
32, 5, 5, 1, 0.471
32, 37, 37, 0, 1.011
32, 37, 37, 1, 1.011
32, 2048, 0, 0, 0.471
32, 2048, 0, 1, 0.471
32, 2053, 0, 0, 0.471
32, 2053, 0, 1, 0.471
32, 2048, 5, 0, 0.471
32, 2048, 5, 1, 0.471
32, 2053, 5, 0, 0.471
32, 2053, 5, 1, 0.471
64, 0, 0, 0, 1.0
64, 0, 0, 1, 1.0
64, 6, 0, 0, 0.862
64, 6, 0, 1, 0.862
64, 38, 0, 0, 0.912
64, 38, 0, 1, 0.912
64, 0, 6, 0, 0.896
64, 0, 6, 1, 0.896
64, 0, 38, 0, 0.906
64, 0, 38, 1, 0.906
64, 6, 6, 0, 0.91
64, 6, 6, 1, 0.91
64, 38, 38, 0, 0.883
64, 38, 38, 1, 0.883
64, 2048, 0, 0, 1.0
64, 2048, 0, 1, 1.0
64, 2054, 0, 0, 0.862
64, 2054, 0, 1, 0.862
64, 2048, 6, 0, 0.887
64, 2048, 6, 1, 0.887
64, 2054, 6, 0, 0.887
64, 2054, 6, 1, 0.887
128, 0, 0, 0, 0.857
128, 0, 0, 1, 0.857
128, 7, 0, 0, 0.875
128, 7, 0, 1, 0.875
128, 39, 0, 0, 0.892
128, 39, 0, 1, 0.892
128, 0, 7, 0, 1.183
128, 0, 7, 1, 1.183
128, 0, 39, 0, 1.113
128, 0, 39, 1, 1.113
128, 7, 7, 0, 0.692
128, 7, 7, 1, 0.692
128, 39, 39, 0, 1.104
128, 39, 39, 1, 1.104
128, 2048, 0, 0, 0.857
128, 2048, 0, 1, 0.857
128, 2055, 0, 0, 0.875
128, 2055, 0, 1, 0.875
128, 2048, 7, 0, 0.959
128, 2048, 7, 1, 0.959
128, 2055, 7, 0, 1.036
128, 2055, 7, 1, 1.036
256, 0, 0, 0, 0.889
256, 0, 0, 1, 0.889
256, 8, 0, 0, 0.966
256, 8, 0, 1, 0.966
256, 40, 0, 0, 0.983
256, 40, 0, 1, 0.983
256, 0, 8, 0, 1.29
256, 0, 8, 1, 1.29
256, 0, 40, 0, 1.274
256, 0, 40, 1, 1.274
256, 8, 8, 0, 0.865
256, 8, 8, 1, 0.865
256, 40, 40, 0, 1.477
256, 40, 40, 1, 1.477
256, 2048, 0, 0, 0.889
256, 2048, 0, 1, 0.889
256, 2056, 0, 0, 0.966
256, 2056, 0, 1, 0.966
256, 2048, 8, 0, 0.952
256, 2048, 8, 1, 0.952
256, 2056, 8, 0, 0.878
256, 2056, 8, 1, 0.878
512, 0, 0, 0, 1.077
512, 0, 0, 1, 1.077
512, 9, 0, 0, 1.001
512, 9, 0, 1, 1.0
512, 41, 0, 0, 0.954
512, 41, 0, 1, 0.954
512, 0, 9, 0, 1.191
512, 0, 9, 1, 1.191
512, 0, 41, 0, 1.181
512, 0, 41, 1, 1.181
512, 9, 9, 0, 0.765
512, 9, 9, 1, 0.765
512, 41, 41, 0, 0.905
512, 41, 41, 1, 0.905
512, 2048, 0, 0, 1.077
512, 2048, 0, 1, 1.077
512, 2057, 0, 0, 1.0
512, 2057, 0, 1, 1.0
512, 2048, 9, 0, 1.0
512, 2048, 9, 1, 1.0
512, 2057, 9, 0, 0.733
512, 2057, 9, 1, 0.733
1024, 0, 0, 0, 1.143
1024, 0, 0, 1, 1.143
1024, 10, 0, 0, 1.015
1024, 10, 0, 1, 1.015
1024, 42, 0, 0, 1.045
1024, 42, 0, 1, 1.045
1024, 0, 10, 0, 1.126
1024, 0, 10, 1, 1.126
1024, 0, 42, 0, 1.114
1024, 0, 42, 1, 1.114
1024, 10, 10, 0, 0.89
1024, 10, 10, 1, 0.89
1024, 42, 42, 0, 0.986
1024, 42, 42, 1, 0.986
1024, 2048, 0, 0, 1.143
1024, 2048, 0, 1, 1.143
1024, 2058, 0, 0, 1.015
1024, 2058, 0, 1, 1.015
1024, 2048, 10, 0, 1.03
1024, 2048, 10, 1, 1.03
1024, 2058, 10, 0, 0.854
1024, 2058, 10, 1, 0.854
2048, 0, 0, 0, 1.005
2048, 0, 0, 1, 1.005
2048, 11, 0, 0, 1.013
2048, 11, 0, 1, 1.014
2048, 43, 0, 0, 1.044
2048, 43, 0, 1, 1.044
2048, 0, 11, 0, 1.003
2048, 0, 11, 1, 1.003
2048, 0, 43, 0, 1.003
2048, 0, 43, 1, 1.003
2048, 11, 11, 0, 0.92
2048, 11, 11, 1, 0.92
2048, 43, 43, 0, 1.0
2048, 43, 43, 1, 1.0
2048, 2048, 0, 0, 1.005
2048, 2048, 0, 1, 1.005
2048, 2059, 0, 0, 0.904
2048, 2059, 0, 1, 0.904
2048, 2048, 11, 0, 1.0
2048, 2048, 11, 1, 1.0
2048, 2059, 11, 0, 0.979
2048, 2059, 11, 1, 0.979
4096, 0, 0, 0, 1.014
4096, 0, 0, 1, 1.014
4096, 12, 0, 0, 0.855
4096, 12, 0, 1, 0.855
4096, 44, 0, 0, 0.857
4096, 44, 0, 1, 0.857
4096, 0, 12, 0, 0.932
4096, 0, 12, 1, 0.932
4096, 0, 44, 0, 0.932
4096, 0, 44, 1, 0.932
4096, 12, 12, 0, 0.999
4096, 12, 12, 1, 0.999
4096, 44, 44, 0, 1.051
4096, 44, 44, 1, 1.051
4096, 2048, 0, 0, 1.014
4096, 2048, 0, 1, 1.014
4096, 2060, 0, 0, 0.98
4096, 2060, 0, 1, 0.98
4096, 2048, 12, 0, 0.77
4096, 2048, 12, 1, 0.77
4096, 2060, 12, 0, 0.943
4096, 2060, 12, 1, 0.943
8192, 0, 0, 0, 1.046
8192, 0, 0, 1, 1.046
8192, 13, 0, 0, 0.885
8192, 13, 0, 1, 0.885
8192, 45, 0, 0, 0.887
8192, 45, 0, 1, 0.886
8192, 0, 13, 0, 0.942
8192, 0, 13, 1, 0.942
8192, 0, 45, 0, 0.942
8192, 0, 45, 1, 0.942
8192, 13, 13, 0, 1.03
8192, 13, 13, 1, 1.03
8192, 45, 45, 0, 1.048
8192, 45, 45, 1, 1.048
8192, 2048, 0, 0, 1.048
8192, 2048, 0, 1, 1.048
8192, 2061, 0, 0, 1.011
8192, 2061, 0, 1, 1.011
8192, 2048, 13, 0, 0.789
8192, 2048, 13, 1, 0.789
8192, 2061, 13, 0, 0.991
8192, 2061, 13, 1, 0.991
16384, 0, 0, 0, 1.014
16384, 0, 0, 1, 1.008
16384, 14, 0, 0, 0.951
16384, 14, 0, 1, 0.95
16384, 46, 0, 0, 0.874
16384, 46, 0, 1, 0.871
16384, 0, 14, 0, 0.813
16384, 0, 14, 1, 0.81
16384, 0, 46, 0, 0.85
16384, 0, 46, 1, 0.86
16384, 14, 14, 0, 0.985
16384, 14, 14, 1, 0.975
16384, 46, 46, 0, 1.025
16384, 46, 46, 1, 1.027
16384, 2048, 0, 0, 1.058
16384, 2048, 0, 1, 1.058
16384, 2062, 0, 0, 0.849
16384, 2062, 0, 1, 0.848
16384, 2048, 14, 0, 0.907
16384, 2048, 14, 1, 0.907
16384, 2062, 14, 0, 0.988
16384, 2062, 14, 1, 0.995
32768, 0, 0, 0, 0.979
32768, 0, 0, 1, 0.979
32768, 15, 0, 0, 1.006
32768, 15, 0, 1, 1.006
32768, 47, 0, 0, 1.004
32768, 47, 0, 1, 1.004
32768, 0, 15, 0, 1.045
32768, 0, 15, 1, 1.045
32768, 0, 47, 0, 1.011
32768, 0, 47, 1, 1.012
32768, 15, 15, 0, 0.977
32768, 15, 15, 1, 0.977
32768, 47, 47, 0, 0.96
32768, 47, 47, 1, 0.96
32768, 2048, 0, 0, 0.978
32768, 2048, 0, 1, 0.978
32768, 2063, 0, 0, 1.004
32768, 2063, 0, 1, 1.004
32768, 2048, 15, 0, 1.036
32768, 2048, 15, 1, 1.036
32768, 2063, 15, 0, 0.978
32768, 2063, 15, 1, 0.978
65536, 0, 0, 0, 0.981
65536, 0, 0, 1, 0.981
65536, 16, 0, 0, 0.987
65536, 16, 0, 1, 0.987
65536, 48, 0, 0, 0.968
65536, 48, 0, 1, 0.968
65536, 0, 16, 0, 1.014
65536, 0, 16, 1, 1.014
65536, 0, 48, 0, 0.984
65536, 0, 48, 1, 0.984
65536, 16, 16, 0, 1.01
65536, 16, 16, 1, 1.01
65536, 48, 48, 0, 0.968
65536, 48, 48, 1, 0.968
65536, 2048, 0, 0, 0.982
65536, 2048, 0, 1, 0.982
65536, 2064, 0, 0, 0.987
65536, 2064, 0, 1, 0.987
65536, 2048, 16, 0, 1.012
65536, 2048, 16, 1, 1.012
65536, 2064, 16, 0, 1.007
65536, 2064, 16, 1, 1.007
0, 0, 0, 0, 2.104
0, 2048, 0, 0, 2.104
0, 4095, 0, 0, 2.109
0, 0, 4095, 0, 2.103
1, 1, 0, 0, 2.104
1, 0, 1, 0, 2.098
1, 1, 1, 0, 2.098
1, 2049, 0, 0, 2.102
1, 2048, 1, 0, 2.098
1, 2049, 1, 0, 2.098
1, 4095, 0, 0, 2.103
1, 0, 4095, 0, 2.098
2, 2, 0, 0, 1.139
2, 0, 2, 0, 1.136
2, 2, 2, 0, 1.136
2, 2050, 0, 0, 1.139
2, 2048, 2, 0, 1.136
2, 2050, 2, 0, 1.136
2, 4095, 0, 0, 1.0
2, 0, 4095, 0, 1.022
3, 0, 0, 0, 0.981
3, 3, 0, 0, 0.984
3, 0, 3, 0, 0.982
3, 3, 3, 0, 0.982
3, 2048, 0, 0, 0.982
3, 2051, 0, 0, 0.983
3, 2048, 3, 0, 0.982
3, 2051, 3, 0, 0.982
3, 4095, 0, 0, 0.285
3, 0, 4095, 0, 0.231
4, 4, 0, 0, 1.373
4, 0, 4, 0, 1.31
4, 4, 4, 0, 1.282
4, 2052, 0, 0, 1.264
4, 2048, 4, 0, 1.254
4, 2052, 4, 0, 1.254
4, 4095, 0, 0, 1.971
4, 0, 4095, 0, 1.994
5, 0, 0, 0, 1.145
5, 5, 0, 0, 1.155
5, 0, 5, 0, 1.171
5, 5, 5, 0, 1.171
5, 2048, 0, 0, 1.197
5, 2053, 0, 0, 1.173
5, 2048, 5, 0, 1.171
5, 2053, 5, 0, 1.171
5, 4095, 0, 0, 0.935
5, 0, 4095, 0, 1.017
6, 0, 0, 0, 1.145
6, 6, 0, 0, 1.098
6, 0, 6, 0, 1.096
6, 6, 6, 0, 1.096
6, 2048, 0, 0, 1.12
6, 2054, 0, 0, 1.122
6, 2048, 6, 0, 1.12
6, 2054, 6, 0, 1.096
6, 4095, 0, 0, 0.935
6, 0, 4095, 0, 1.018
7, 0, 0, 0, 1.071
7, 7, 0, 0, 1.074
7, 0, 7, 0, 1.072
7, 7, 7, 0, 1.072
7, 2048, 0, 0, 1.096
7, 2055, 0, 0, 1.098
7, 2048, 7, 0, 1.096
7, 2055, 7, 0, 1.096
7, 4095, 0, 0, 0.935
7, 0, 4095, 0, 1.016
8, 8, 0, 0, 1.167
8, 0, 8, 0, 1.028
8, 8, 8, 0, 1.028
8, 2056, 0, 0, 1.069
8, 2048, 8, 0, 1.028
8, 2056, 8, 0, 1.028
8, 4095, 0, 0, 1.029
8, 0, 4095, 0, 1.043
9, 0, 0, 0, 0.799
9, 9, 0, 0, 0.801
9, 0, 9, 0, 0.799
9, 9, 9, 0, 0.799
9, 2048, 0, 0, 0.8
9, 2057, 0, 0, 0.801
9, 2048, 9, 0, 0.8
9, 2057, 9, 0, 0.799
9, 4095, 0, 0, 0.909
9, 0, 4095, 0, 1.0
10, 0, 0, 0, 0.799
10, 10, 0, 0, 0.801
10, 0, 10, 0, 0.8
10, 10, 10, 0, 0.8
10, 2048, 0, 0, 0.8
10, 2058, 0, 0, 0.801
10, 2048, 10, 0, 0.8
10, 2058, 10, 0, 0.8
10, 4095, 0, 0, 0.909
10, 0, 4095, 0, 1.0
11, 0, 0, 0, 0.799
11, 11, 0, 0, 0.801
11, 0, 11, 0, 0.8
11, 11, 11, 0, 0.8
11, 2048, 0, 0, 0.8
11, 2059, 0, 0, 0.802
11, 2048, 11, 0, 0.8
11, 2059, 11, 0, 0.8
11, 4095, 0, 0, 0.909
11, 0, 4095, 0, 1.0
12, 0, 0, 0, 0.799
12, 12, 0, 0, 0.801
12, 0, 12, 0, 0.8
12, 12, 12, 0, 0.8
12, 2048, 0, 0, 0.8
12, 2060, 0, 0, 0.802
12, 2048, 12, 0, 0.8
12, 2060, 12, 0, 0.8
12, 4095, 0, 0, 0.909
12, 0, 4095, 0, 1.0
13, 0, 0, 0, 0.798
13, 13, 0, 0, 0.801
13, 0, 13, 0, 0.799
13, 13, 13, 0, 0.799
13, 2048, 0, 0, 0.8
13, 2061, 0, 0, 0.801
13, 2048, 13, 0, 0.8
13, 2061, 13, 0, 0.8
13, 4095, 0, 0, 0.909
13, 0, 4095, 0, 1.0
14, 0, 0, 0, 0.799
14, 14, 0, 0, 0.801
14, 0, 14, 0, 0.8
14, 14, 14, 0, 0.8
14, 2048, 0, 0, 0.8
14, 2062, 0, 0, 0.801
14, 2048, 14, 0, 0.8
14, 2062, 14, 0, 0.8
14, 4095, 0, 0, 0.909
14, 0, 4095, 0, 1.0
15, 0, 0, 0, 0.799
15, 15, 0, 0, 0.801
15, 0, 15, 0, 0.8
15, 15, 15, 0, 0.8
15, 2048, 0, 0, 0.8
15, 2063, 0, 0, 0.802
15, 2048, 15, 0, 0.8
15, 2063, 15, 0, 0.8
15, 4095, 0, 0, 0.909
15, 0, 4095, 0, 1.0
16, 16, 0, 0, 0.801
16, 0, 16, 0, 0.799
16, 16, 16, 0, 0.799
16, 2064, 0, 0, 0.801
16, 2048, 16, 0, 0.798
16, 2064, 16, 0, 0.798
16, 4095, 0, 0, 1.818
16, 0, 4095, 0, 1.957
17, 0, 0, 0, 0.798
17, 17, 0, 0, 0.8
17, 0, 17, 0, 0.799
17, 17, 17, 0, 0.798
17, 2048, 0, 0, 0.798
17, 2065, 0, 0, 0.8
17, 2048, 17, 0, 0.798
17, 2065, 17, 0, 0.799
17, 4095, 0, 0, 0.937
17, 0, 4095, 0, 1.021
18, 0, 0, 0, 0.798
18, 18, 0, 0, 0.801
18, 0, 18, 0, 0.798
18, 18, 18, 0, 0.798
18, 2048, 0, 0, 0.799
18, 2066, 0, 0, 0.8
18, 2048, 18, 0, 0.798
18, 2066, 18, 0, 0.798
18, 4095, 0, 0, 0.937
18, 0, 4095, 0, 1.021
19, 0, 0, 0, 0.798
19, 19, 0, 0, 0.8
19, 0, 19, 0, 0.798
19, 19, 19, 0, 0.798
19, 2048, 0, 0, 0.798
19, 2067, 0, 0, 0.8
19, 2048, 19, 0, 0.798
19, 2067, 19, 0, 0.798
19, 4095, 0, 0, 0.937
19, 0, 4095, 0, 1.021
20, 0, 0, 0, 0.798
20, 20, 0, 0, 0.8
20, 0, 20, 0, 0.798
20, 20, 20, 0, 0.798
20, 2048, 0, 0, 0.798
20, 2068, 0, 0, 0.8
20, 2048, 20, 0, 0.798
20, 2068, 20, 0, 0.798
20, 4095, 0, 0, 0.937
20, 0, 4095, 0, 1.021
21, 0, 0, 0, 0.798
21, 21, 0, 0, 0.801
21, 0, 21, 0, 0.798
21, 21, 21, 0, 0.798
21, 2048, 0, 0, 0.798
21, 2069, 0, 0, 0.801
21, 2048, 21, 0, 0.799
21, 2069, 21, 0, 0.798
21, 4095, 0, 0, 0.937
21, 0, 4095, 0, 1.021
22, 0, 0, 0, 0.798
22, 22, 0, 0, 0.801
22, 0, 22, 0, 0.798
22, 22, 22, 0, 0.798
22, 2048, 0, 0, 0.798
22, 2070, 0, 0, 0.801
22, 2048, 22, 0, 0.798
22, 2070, 22, 0, 0.798
22, 4095, 0, 0, 0.937
22, 0, 4095, 0, 1.021
23, 0, 0, 0, 0.798
23, 23, 0, 0, 0.8
23, 0, 23, 0, 0.798
23, 23, 23, 0, 0.798
23, 2048, 0, 0, 0.798
23, 2071, 0, 0, 0.8
23, 2048, 23, 0, 0.798
23, 2071, 23, 0, 0.798
23, 4095, 0, 0, 0.937
23, 0, 4095, 0, 1.021
24, 0, 0, 0, 0.798
24, 24, 0, 0, 0.8
24, 0, 24, 0, 0.799
24, 24, 24, 0, 0.798
24, 2048, 0, 0, 0.798
24, 2072, 0, 0, 0.801
24, 2048, 24, 0, 0.798
24, 2072, 24, 0, 0.798
24, 4095, 0, 0, 0.937
24, 0, 4095, 0, 1.021
25, 0, 0, 0, 0.5
25, 25, 0, 0, 0.5
25, 0, 25, 0, 0.5
25, 25, 25, 0, 0.5
25, 2048, 0, 0, 0.5
25, 2073, 0, 0, 0.501
25, 2048, 25, 0, 0.5
25, 2073, 25, 0, 0.5
25, 4095, 0, 0, 0.974
25, 0, 4095, 0, 0.98
26, 0, 0, 0, 0.5
26, 26, 0, 0, 0.501
26, 0, 26, 0, 0.5
26, 26, 26, 0, 0.501
26, 2048, 0, 0, 0.5
26, 2074, 0, 0, 0.5
26, 2048, 26, 0, 0.5
26, 2074, 26, 0, 0.5
26, 4095, 0, 0, 0.974
26, 0, 4095, 0, 1.0
27, 0, 0, 0, 0.5
27, 27, 0, 0, 0.501
27, 0, 27, 0, 0.5
27, 27, 27, 0, 0.5
27, 2048, 0, 0, 0.5
27, 2075, 0, 0, 0.5
27, 2048, 27, 0, 0.5
27, 2075, 27, 0, 0.5
27, 4095, 0, 0, 0.974
27, 0, 4095, 0, 1.0
28, 0, 0, 0, 0.5
28, 28, 0, 0, 0.501
28, 0, 28, 0, 0.5
28, 28, 28, 0, 0.5
28, 2048, 0, 0, 0.5
28, 2076, 0, 0, 0.5
28, 2048, 28, 0, 0.5
28, 2076, 28, 0, 0.5
28, 4095, 0, 0, 0.974
28, 0, 4095, 0, 1.0
29, 0, 0, 0, 0.471
29, 29, 0, 0, 0.471
29, 0, 29, 0, 0.471
29, 29, 29, 0, 0.471
29, 2048, 0, 0, 0.471
29, 2077, 0, 0, 0.471
29, 2048, 29, 0, 0.471
29, 2077, 29, 0, 0.471
29, 4095, 0, 0, 0.974
29, 0, 4095, 0, 1.0
30, 0, 0, 0, 0.471
30, 30, 0, 0, 0.471
30, 0, 30, 0, 0.471
30, 30, 30, 0, 0.471
30, 2048, 0, 0, 0.471
30, 2078, 0, 0, 0.471
30, 2048, 30, 0, 0.471
30, 2078, 30, 0, 0.471
30, 4095, 0, 0, 0.974
30, 0, 4095, 0, 1.0
31, 0, 0, 0, 0.471
31, 31, 0, 0, 0.471
31, 0, 31, 0, 0.471
31, 31, 31, 0, 0.471
31, 2048, 0, 0, 0.471
31, 2079, 0, 0, 0.471
31, 2048, 31, 0, 0.471
31, 2079, 31, 0, 0.471
31, 4095, 0, 0, 0.974
31, 0, 4095, 0, 1.0
48, 0, 0, 0, 1.0
48, 0, 0, 1, 1.0
48, 3, 0, 0, 1.0
48, 3, 0, 1, 1.0
48, 0, 3, 0, 1.0
48, 0, 3, 1, 1.0
48, 3, 3, 0, 1.0
48, 3, 3, 1, 1.0
48, 2048, 0, 0, 1.0
48, 2048, 0, 1, 1.0
48, 2051, 0, 0, 1.0
48, 2051, 0, 1, 1.0
48, 2048, 3, 0, 1.0
48, 2048, 3, 1, 1.0
48, 2051, 3, 0, 1.0
48, 2051, 3, 1, 1.0
80, 0, 0, 0, 0.781
80, 0, 0, 1, 0.782
80, 5, 0, 0, 0.976
80, 5, 0, 1, 0.976
80, 0, 5, 0, 1.232
80, 0, 5, 1, 1.232
80, 5, 5, 0, 1.542
80, 5, 5, 1, 1.543
80, 2048, 0, 0, 0.781
80, 2048, 0, 1, 0.782
80, 2053, 0, 0, 0.976
80, 2053, 0, 1, 0.976
80, 2048, 5, 0, 1.093
80, 2048, 5, 1, 1.093
80, 2053, 5, 0, 1.371
80, 2053, 5, 1, 1.371
96, 0, 0, 0, 0.758
96, 0, 0, 1, 0.758
96, 6, 0, 0, 0.929
96, 6, 0, 1, 0.929
96, 0, 6, 0, 1.204
96, 0, 6, 1, 1.204
96, 6, 6, 0, 1.562
96, 6, 6, 1, 1.562
96, 2048, 0, 0, 0.758
96, 2048, 0, 1, 0.758
96, 2054, 0, 0, 0.929
96, 2054, 0, 1, 0.929
96, 2048, 6, 0, 1.068
96, 2048, 6, 1, 1.068
96, 2054, 6, 0, 1.562
96, 2054, 6, 1, 1.562
112, 0, 0, 0, 0.736
112, 0, 0, 1, 0.736
112, 7, 0, 0, 0.675
112, 7, 0, 1, 0.675
112, 0, 7, 0, 0.778
112, 0, 7, 1, 0.778
112, 7, 7, 0, 0.909
112, 7, 7, 1, 0.909
112, 2048, 0, 0, 0.736
112, 2048, 0, 1, 0.736
112, 2055, 0, 0, 0.675
112, 2055, 0, 1, 0.675
112, 2048, 7, 0, 0.778
112, 2048, 7, 1, 0.778
112, 2055, 7, 0, 0.909
112, 2055, 7, 1, 0.909
144, 0, 0, 0, 0.857
144, 0, 0, 1, 0.857
144, 9, 0, 0, 0.941
144, 9, 0, 1, 0.943
144, 0, 9, 0, 1.137
144, 0, 9, 1, 1.137
144, 9, 9, 0, 1.514
144, 9, 9, 1, 1.514
144, 2048, 0, 0, 0.857
144, 2048, 0, 1, 0.857
144, 2057, 0, 0, 0.939
144, 2057, 0, 1, 0.945
144, 2048, 9, 0, 0.922
144, 2048, 9, 1, 0.922
144, 2057, 9, 0, 1.514
144, 2057, 9, 1, 1.514
160, 0, 0, 0, 0.698
160, 0, 0, 1, 0.698
160, 10, 0, 0, 0.91
160, 10, 0, 1, 0.91
160, 0, 10, 0, 1.211
160, 0, 10, 1, 1.212
160, 10, 10, 0, 1.357
160, 10, 10, 1, 1.357
160, 2048, 0, 0, 0.698
160, 2048, 0, 1, 0.698
160, 2058, 0, 0, 0.91
160, 2058, 0, 1, 0.91
160, 2048, 10, 0, 0.923
160, 2048, 10, 1, 0.923
160, 2058, 10, 0, 1.357
160, 2058, 10, 1, 1.357
176, 0, 0, 0, 0.796
176, 0, 0, 1, 0.796
176, 11, 0, 0, 0.804
176, 11, 0, 1, 0.804
176, 0, 11, 0, 0.774
176, 0, 11, 1, 0.774
176, 11, 11, 0, 0.814
176, 11, 11, 1, 0.814
176, 2048, 0, 0, 0.796
176, 2048, 0, 1, 0.796
176, 2059, 0, 0, 0.804
176, 2059, 0, 1, 0.804
176, 2048, 11, 0, 0.774
176, 2048, 11, 1, 0.774
176, 2059, 11, 0, 0.814
176, 2059, 11, 1, 0.814
192, 0, 0, 0, 0.778
192, 0, 0, 1, 0.778
192, 12, 0, 0, 0.881
192, 12, 0, 1, 0.881
192, 0, 12, 0, 1.167
192, 0, 12, 1, 1.167
192, 12, 12, 0, 0.841
192, 12, 12, 1, 0.841
192, 2048, 0, 0, 0.778
192, 2048, 0, 1, 0.778
192, 2060, 0, 0, 0.881
192, 2060, 0, 1, 0.881
192, 2048, 12, 0, 0.889
192, 2048, 12, 1, 0.889
192, 2060, 12, 0, 0.906
192, 2060, 12, 1, 0.906
208, 0, 0, 0, 0.833
208, 0, 0, 1, 0.833
208, 13, 0, 0, 0.921
208, 13, 0, 1, 0.921
208, 0, 13, 0, 0.835
208, 0, 13, 1, 0.833
208, 13, 13, 0, 1.333
208, 13, 13, 1, 1.333
208, 2048, 0, 0, 0.833
208, 2048, 0, 1, 0.833
208, 2061, 0, 0, 0.921
208, 2061, 0, 1, 0.921
208, 2048, 13, 0, 0.833
208, 2048, 13, 1, 0.833
208, 2061, 13, 0, 1.333
208, 2061, 13, 1, 1.333
224, 0, 0, 0, 0.93
224, 0, 0, 1, 0.93
224, 14, 0, 0, 1.0
224, 14, 0, 1, 1.0
224, 0, 14, 0, 1.15
224, 0, 14, 1, 1.15
224, 14, 14, 0, 1.452
224, 14, 14, 1, 1.452
224, 2048, 0, 0, 0.93
224, 2048, 0, 1, 0.93
224, 2062, 0, 0, 1.0
224, 2062, 0, 1, 1.0
224, 2048, 14, 0, 0.833
224, 2048, 14, 1, 0.833
224, 2062, 14, 0, 1.452
224, 2062, 14, 1, 1.452
240, 0, 0, 0, 0.909
240, 0, 0, 1, 0.909
240, 15, 0, 0, 0.797
240, 15, 0, 1, 0.797
240, 0, 15, 0, 0.771
240, 0, 15, 1, 0.771
240, 15, 15, 0, 0.93
240, 15, 15, 1, 0.93
240, 2048, 0, 0, 0.909
240, 2048, 0, 1, 0.909
240, 2063, 0, 0, 0.797
240, 2063, 0, 1, 0.797
240, 2048, 15, 0, 0.771
240, 2048, 15, 1, 0.771
240, 2063, 15, 0, 0.93
240, 2063, 15, 1, 0.93
272, 0, 0, 0, 0.9
272, 0, 0, 1, 0.9
272, 17, 0, 0, 1.015
272, 17, 0, 1, 1.015
272, 0, 17, 0, 0.926
272, 0, 17, 1, 0.927
272, 17, 17, 0, 0.892
272, 17, 17, 1, 0.892
272, 2048, 0, 0, 0.9
272, 2048, 0, 1, 0.9
272, 2065, 0, 0, 1.015
272, 2065, 0, 1, 1.015
272, 2048, 17, 0, 0.927
272, 2048, 17, 1, 0.927
272, 2065, 17, 0, 0.878
272, 2065, 17, 1, 0.878
288, 0, 0, 0, 0.882
288, 0, 0, 1, 0.882
288, 18, 0, 0, 0.803
288, 18, 0, 1, 0.803
288, 0, 18, 0, 0.768
288, 0, 18, 1, 0.768
288, 18, 18, 0, 0.882
288, 18, 18, 1, 0.882
288, 2048, 0, 0, 0.882
288, 2048, 0, 1, 0.882
288, 2066, 0, 0, 0.803
288, 2066, 0, 1, 0.803
288, 2048, 18, 0, 0.768
288, 2048, 18, 1, 0.768
288, 2066, 18, 0, 0.882
288, 2066, 18, 1, 0.882
304, 0, 0, 0, 0.865
304, 0, 0, 1, 0.865
304, 19, 0, 0, 0.944
304, 19, 0, 1, 0.944
304, 0, 19, 0, 0.943
304, 0, 19, 1, 0.943
304, 19, 19, 0, 0.956
304, 19, 19, 1, 0.956
304, 2048, 0, 0, 0.866
304, 2048, 0, 1, 0.865
304, 2067, 0, 0, 0.944
304, 2067, 0, 1, 0.944
304, 2048, 19, 0, 0.943
304, 2048, 19, 1, 0.943
304, 2067, 19, 0, 0.947
304, 2067, 19, 1, 0.947
320, 0, 0, 0, 0.944
320, 0, 0, 1, 0.944
320, 20, 0, 0, 0.962
320, 20, 0, 1, 0.962
320, 0, 20, 0, 1.214
320, 0, 20, 1, 1.214
320, 20, 20, 0, 1.365
320, 20, 20, 1, 1.365
320, 2048, 0, 0, 0.943
320, 2048, 0, 1, 0.943
320, 2068, 0, 0, 0.962
320, 2068, 0, 1, 0.962
320, 2048, 20, 0, 0.914
320, 2048, 20, 1, 0.914
320, 2068, 20, 0, 1.365
320, 2068, 20, 1, 1.365
336, 0, 0, 0, 1.0
336, 0, 0, 1, 1.0
336, 21, 0, 0, 0.986
336, 21, 0, 1, 0.986
336, 0, 21, 0, 0.853
336, 0, 21, 1, 0.853
336, 21, 21, 0, 0.843
336, 21, 21, 1, 0.843
336, 2048, 0, 0, 1.0
336, 2048, 0, 1, 1.0
336, 2069, 0, 0, 0.986
336, 2069, 0, 1, 0.986
336, 2048, 21, 0, 0.853
336, 2048, 21, 1, 0.853
336, 2069, 21, 0, 0.831
336, 2069, 21, 1, 0.831
352, 0, 0, 0, 0.98
352, 0, 0, 1, 0.98
352, 22, 0, 0, 0.811
352, 22, 0, 1, 0.811
352, 0, 22, 0, 0.882
352, 0, 22, 1, 0.882
352, 22, 22, 0, 1.1
352, 22, 22, 1, 1.1
352, 2048, 0, 0, 0.98
352, 2048, 0, 1, 0.98
352, 2070, 0, 0, 0.811
352, 2070, 0, 1, 0.811
352, 2048, 22, 0, 0.882
352, 2048, 22, 1, 0.882
352, 2070, 22, 0, 1.1
352, 2070, 22, 1, 1.1
368, 0, 0, 0, 1.058
368, 0, 0, 1, 1.058
368, 23, 0, 0, 1.0
368, 23, 0, 1, 1.0
368, 0, 23, 0, 0.948
368, 0, 23, 1, 0.948
368, 23, 23, 0, 0.723
368, 23, 23, 1, 0.723
368, 2048, 0, 0, 1.058
368, 2048, 0, 1, 1.058
368, 2071, 0, 0, 1.0
368, 2071, 0, 1, 1.0
368, 2048, 23, 0, 0.948
368, 2048, 23, 1, 0.948
368, 2071, 23, 0, 0.701
368, 2071, 23, 1, 0.701
384, 0, 0, 0, 1.012
384, 0, 0, 1, 1.012
384, 24, 0, 0, 1.04
384, 24, 0, 1, 1.04
384, 0, 24, 0, 1.154
384, 0, 24, 1, 1.154
384, 24, 24, 0, 1.423
384, 24, 24, 1, 1.423
384, 2048, 0, 0, 1.012
384, 2048, 0, 1, 1.012
384, 2072, 0, 0, 1.04
384, 2072, 0, 1, 1.04
384, 2048, 24, 0, 0.91
384, 2048, 24, 1, 0.91
384, 2072, 24, 0, 1.423
384, 2072, 24, 1, 1.423
400, 0, 0, 0, 0.948
400, 0, 0, 1, 0.948
400, 25, 0, 0, 0.957
400, 25, 0, 1, 0.957
400, 0, 25, 0, 1.099
400, 0, 25, 1, 1.069
400, 25, 25, 0, 0.885
400, 25, 25, 1, 0.885
400, 2048, 0, 0, 0.948
400, 2048, 0, 1, 0.948
400, 2073, 0, 0, 0.957
400, 2073, 0, 1, 0.957
400, 2048, 25, 0, 0.94
400, 2048, 25, 1, 0.94
400, 2073, 25, 0, 0.908
400, 2073, 25, 1, 0.908
416, 0, 0, 0, 1.017
416, 0, 0, 1, 1.017
416, 26, 0, 0, 0.903
416, 26, 0, 1, 0.903
416, 0, 26, 0, 0.881
416, 0, 26, 1, 0.881
416, 26, 26, 0, 1.035
416, 26, 26, 1, 1.035
416, 2048, 0, 0, 1.017
416, 2048, 0, 1, 1.017
416, 2074, 0, 0, 0.903
416, 2074, 0, 1, 0.903
416, 2048, 26, 0, 0.881
416, 2048, 26, 1, 0.881
416, 2074, 26, 0, 1.034
416, 2074, 26, 1, 1.035
432, 0, 0, 0, 1.0
432, 0, 0, 1, 1.0
432, 27, 0, 0, 0.933
432, 27, 0, 1, 0.933
432, 0, 27, 0, 0.941
432, 0, 27, 1, 0.941
432, 27, 27, 0, 0.953
432, 27, 27, 1, 0.954
432, 2048, 0, 0, 1.0
432, 2048, 0, 1, 1.0
432, 2075, 0, 0, 0.933
432, 2075, 0, 1, 0.933
432, 2048, 27, 0, 0.941
432, 2048, 27, 1, 0.941
432, 2075, 27, 0, 0.93
432, 2075, 27, 1, 0.93
448, 0, 0, 0, 0.984
448, 0, 0, 1, 0.984
448, 28, 0, 0, 0.896
448, 28, 0, 1, 0.896
448, 0, 28, 0, 1.244
448, 0, 28, 1, 1.244
448, 28, 28, 0, 1.333
448, 28, 28, 1, 1.333
448, 2048, 0, 0, 0.984
448, 2048, 0, 1, 0.984
448, 2076, 0, 0, 0.896
448, 2076, 0, 1, 0.896
448, 2048, 28, 0, 0.988
448, 2048, 28, 1, 0.988
448, 2076, 28, 0, 1.333
448, 2076, 28, 1, 1.333
464, 0, 0, 0, 1.083
464, 0, 0, 1, 1.083
464, 29, 0, 0, 0.978
464, 29, 0, 1, 0.978
464, 0, 29, 0, 0.924
464, 0, 29, 1, 0.924
464, 29, 29, 0, 0.901
464, 29, 29, 1, 0.901
464, 2048, 0, 0, 1.083
464, 2048, 0, 1, 1.083
464, 2077, 0, 0, 0.978
464, 2077, 0, 1, 0.978
464, 2048, 29, 0, 0.924
464, 2048, 29, 1, 0.924
464, 2077, 29, 0, 0.89
464, 2077, 29, 1, 0.89
480, 0, 0, 0, 1.066
480, 0, 0, 1, 1.066
480, 30, 0, 0, 0.9
480, 30, 0, 1, 0.9
480, 0, 30, 0, 0.88
480, 0, 30, 1, 0.88
480, 30, 30, 0, 1.083
480, 30, 30, 1, 1.083
480, 2048, 0, 0, 1.066
480, 2048, 0, 1, 1.066
480, 2078, 0, 0, 0.9
480, 2078, 0, 1, 0.9
480, 2048, 30, 0, 0.88
480, 2048, 30, 1, 0.88
480, 2078, 30, 0, 1.083
480, 2078, 30, 1, 1.083
496, 0, 0, 0, 1.032
496, 0, 0, 1, 1.032
496, 31, 0, 0, 0.95
496, 31, 0, 1, 0.95
496, 0, 31, 0, 1.011
496, 0, 31, 1, 1.011
496, 31, 31, 0, 0.973
496, 31, 31, 1, 0.973
496, 2048, 0, 0, 1.032
496, 2048, 0, 1, 1.032
496, 2079, 0, 0, 0.95
496, 2079, 0, 1, 0.95
496, 2048, 31, 0, 1.011
496, 2048, 31, 1, 1.011
496, 2079, 31, 0, 0.941
496, 2079, 31, 1, 0.941
1024, 32, 0, 0, 1.143
1024, 32, 0, 1, 1.143
1024, 0, 32, 0, 1.143
1024, 0, 32, 1, 1.143
1024, 32, 32, 0, 1.143
1024, 32, 32, 1, 1.143
1024, 2080, 0, 0, 1.143
1024, 2080, 0, 1, 1.143
1024, 2048, 32, 0, 1.143
1024, 2048, 32, 1, 1.143
1024, 2080, 32, 0, 1.143
1024, 2080, 32, 1, 1.143
1056, 0, 0, 0, 1.168
1056, 0, 0, 1, 1.168
1056, 33, 0, 0, 1.067
1056, 33, 0, 1, 1.067
1056, 0, 33, 0, 0.977
1056, 0, 33, 1, 0.977
1056, 33, 33, 0, 1.043
1056, 33, 33, 1, 1.043
1056, 2048, 0, 0, 1.168
1056, 2048, 0, 1, 1.168
1056, 2081, 0, 0, 1.067
1056, 2081, 0, 1, 1.067
1056, 2048, 33, 0, 0.977
1056, 2048, 33, 1, 0.977
1056, 2081, 33, 0, 1.0
1056, 2081, 33, 1, 1.0
1088, 0, 0, 0, 1.171
1088, 0, 0, 1, 1.171
1088, 34, 0, 0, 1.041
1088, 34, 0, 1, 1.041
1088, 0, 34, 0, 1.079
1088, 0, 34, 1, 1.079
1088, 34, 34, 0, 0.966
1088, 34, 34, 1, 0.966
1088, 2048, 0, 0, 1.171
1088, 2048, 0, 1, 1.171
1088, 2082, 0, 0, 1.041
1088, 2082, 0, 1, 1.041
1088, 2048, 34, 0, 0.994
1088, 2048, 34, 1, 0.994
1088, 2082, 34, 0, 0.966
1088, 2082, 34, 1, 0.966
1120, 0, 0, 0, 1.152
1120, 0, 0, 1, 1.153
1120, 35, 0, 0, 1.051
1120, 35, 0, 1, 1.051
1120, 0, 35, 0, 1.0
1120, 0, 35, 1, 1.0
1120, 35, 35, 0, 1.068
1120, 35, 35, 1, 1.068
1120, 2048, 0, 0, 1.151
1120, 2048, 0, 1, 1.151
1120, 2083, 0, 0, 1.051
1120, 2083, 0, 1, 1.051
1120, 2048, 35, 0, 1.0
1120, 2048, 35, 1, 1.0
1120, 2083, 35, 0, 1.027
1120, 2083, 35, 1, 1.027
1152, 0, 0, 0, 1.159
1152, 0, 0, 1, 1.159
1152, 36, 0, 0, 1.034
1152, 36, 0, 1, 1.034
1152, 0, 36, 0, 1.07
1152, 0, 36, 1, 1.07
1152, 36, 36, 0, 0.967
1152, 36, 36, 1, 0.967
1152, 2048, 0, 0, 1.159
1152, 2048, 0, 1, 1.159
1152, 2084, 0, 0, 1.034
1152, 2084, 0, 1, 1.034
1152, 2048, 36, 0, 0.984
1152, 2048, 36, 1, 0.984
1152, 2084, 36, 0, 0.967
1152, 2084, 36, 1, 0.967
1184, 0, 0, 0, 1.157
1184, 0, 0, 1, 1.157
1184, 37, 0, 0, 1.067
1184, 37, 0, 1, 1.066
1184, 0, 37, 0, 0.993
1184, 0, 37, 1, 0.993
1184, 37, 37, 0, 1.08
1184, 37, 37, 1, 1.081
1184, 2048, 0, 0, 1.157
1184, 2048, 0, 1, 1.157
1184, 2085, 0, 0, 1.066
1184, 2085, 0, 1, 1.066
1184, 2048, 37, 0, 0.993
1184, 2048, 37, 1, 0.993
1184, 2085, 37, 0, 1.04
1184, 2085, 37, 1, 1.04
1216, 0, 0, 0, 1.139
1216, 0, 0, 1, 1.139
1216, 38, 0, 0, 1.024
1216, 38, 0, 1, 1.024
1216, 0, 38, 0, 1.087
1216, 0, 38, 1, 1.087
1216, 38, 38, 0, 1.0
1216, 38, 38, 1, 1.0
1216, 2048, 0, 0, 1.138
1216, 2048, 0, 1, 1.138
1216, 2086, 0, 0, 1.024
1216, 2086, 0, 1, 1.024
1216, 2048, 38, 0, 1.01
1216, 2048, 38, 1, 1.01
1216, 2086, 38, 0, 1.0
1216, 2086, 38, 1, 1.0
1248, 0, 0, 0, 1.176
1248, 0, 0, 1, 1.174
1248, 39, 0, 0, 1.074
1248, 39, 0, 1, 1.074
1248, 0, 39, 0, 0.966
1248, 0, 39, 1, 0.985
1248, 39, 39, 0, 1.064
1248, 39, 39, 1, 1.064
1248, 2048, 0, 0, 1.179
1248, 2048, 0, 1, 1.179
1248, 2087, 0, 0, 1.074
1248, 2087, 0, 1, 1.074
1248, 2048, 39, 0, 0.985
1248, 2048, 39, 1, 0.985
1248, 2087, 39, 0, 1.026
1248, 2087, 39, 1, 1.026
1280, 0, 0, 0, 0.993
1280, 0, 0, 1, 0.993
1280, 40, 0, 0, 1.051
1280, 40, 0, 1, 1.051
1280, 0, 40, 0, 1.044
1280, 0, 40, 1, 1.045
1280, 40, 40, 0, 1.25
1280, 40, 40, 1, 1.25
1280, 2048, 0, 0, 0.992
1280, 2048, 0, 1, 0.992
1280, 2088, 0, 0, 1.051
1280, 2088, 0, 1, 1.051
1280, 2048, 40, 0, 0.946
1280, 2048, 40, 1, 0.946
1280, 2088, 40, 0, 1.252
1280, 2088, 40, 1, 1.252
1312, 0, 0, 0, 0.969
1312, 0, 0, 1, 0.969
1312, 41, 0, 0, 0.991
1312, 41, 0, 1, 0.991
1312, 0, 41, 0, 0.837
1312, 0, 41, 1, 0.837
1312, 41, 41, 0, 1.025
1312, 41, 41, 1, 1.025
1312, 2048, 0, 0, 0.969
1312, 2048, 0, 1, 0.969
1312, 2089, 0, 0, 0.991
1312, 2089, 0, 1, 0.99
1312, 2048, 41, 0, 0.837
1312, 2048, 41, 1, 0.837
1312, 2089, 41, 0, 0.975
1312, 2089, 41, 1, 0.975
1344, 0, 0, 0, 0.988
1344, 0, 0, 1, 0.988
1344, 42, 0, 0, 1.031
1344, 42, 0, 1, 1.031
1344, 0, 42, 0, 1.033
1344, 0, 42, 1, 1.033
1344, 42, 42, 0, 0.982
1344, 42, 42, 1, 0.982
1344, 2048, 0, 0, 0.992
1344, 2048, 0, 1, 0.992
1344, 2090, 0, 0, 1.031
1344, 2090, 0, 1, 1.031
1344, 2048, 42, 0, 0.943
1344, 2048, 42, 1, 0.942
1344, 2090, 42, 0, 0.982
1344, 2090, 42, 1, 0.982
1376, 0, 0, 0, 1.016
1376, 0, 0, 1, 1.016
1376, 43, 0, 0, 1.01
1376, 43, 0, 1, 1.01
1376, 0, 43, 0, 0.829
1376, 0, 43, 1, 0.829
1376, 43, 43, 0, 1.024
1376, 43, 43, 1, 1.024
1376, 2048, 0, 0, 1.006
1376, 2048, 0, 1, 1.015
1376, 2091, 0, 0, 1.01
1376, 2091, 0, 1, 1.01
1376, 2048, 43, 0, 0.829
1376, 2048, 43, 1, 0.829
1376, 2091, 43, 0, 0.98
1376, 2091, 43, 1, 0.98
1408, 0, 0, 0, 0.987
1408, 0, 0, 1, 0.987
1408, 44, 0, 0, 1.015
1408, 44, 0, 1, 1.015
1408, 0, 44, 0, 1.018
1408, 0, 44, 1, 1.014
1408, 44, 44, 0, 1.004
1408, 44, 44, 1, 0.994
1408, 2048, 0, 0, 0.988
1408, 2048, 0, 1, 0.988
1408, 2092, 0, 0, 1.015
1408, 2092, 0, 1, 1.015
1408, 2048, 44, 0, 0.955
1408, 2048, 44, 1, 0.955
1408, 2092, 44, 0, 1.0
1408, 2092, 44, 1, 0.994
1440, 0, 0, 0, 0.986
1440, 0, 0, 1, 0.986
1440, 45, 0, 0, 1.013
1440, 45, 0, 1, 1.013
1440, 0, 45, 0, 0.814
1440, 0, 45, 1, 0.814
1440, 45, 45, 0, 1.006
1440, 45, 45, 1, 1.006
1440, 2048, 0, 0, 0.986
1440, 2048, 0, 1, 0.986
1440, 2093, 0, 0, 1.013
1440, 2093, 0, 1, 1.013
1440, 2048, 45, 0, 0.814
1440, 2048, 45, 1, 0.814
1440, 2093, 45, 0, 0.966
1440, 2093, 45, 1, 0.966
1472, 0, 0, 0, 0.997
1472, 0, 0, 1, 0.994
1472, 46, 0, 0, 1.045
1472, 46, 0, 1, 1.045
1472, 0, 46, 0, 1.026
1472, 0, 46, 1, 1.026
1472, 46, 46, 0, 0.966
1472, 46, 46, 1, 0.966
1472, 2048, 0, 0, 1.0
1472, 2048, 0, 1, 0.996
1472, 2094, 0, 0, 1.045
1472, 2094, 0, 1, 1.045
1472, 2048, 46, 0, 0.939
1472, 2048, 46, 1, 0.939
1472, 2094, 46, 0, 0.966
1472, 2094, 46, 1, 0.966
1504, 0, 0, 0, 0.993
1504, 0, 0, 1, 0.993
1504, 47, 0, 0, 0.999
1504, 47, 0, 1, 0.999
1504, 0, 47, 0, 0.826
1504, 0, 47, 1, 0.826
1504, 47, 47, 0, 1.023
1504, 47, 47, 1, 1.023
1504, 2048, 0, 0, 0.993
1504, 2048, 0, 1, 0.993
1504, 2095, 0, 0, 0.999
1504, 2095, 0, 1, 0.999
1504, 2048, 47, 0, 0.826
1504, 2048, 47, 1, 0.826
1504, 2095, 47, 0, 0.993
1504, 2095, 47, 1, 0.993
1536, 0, 0, 0, 0.992
1536, 0, 0, 1, 0.991
1536, 48, 0, 0, 1.019
1536, 48, 0, 1, 1.019
1536, 0, 48, 0, 1.025
1536, 0, 48, 1, 1.024
1536, 48, 48, 0, 0.994
1536, 48, 48, 1, 0.994
1536, 2048, 0, 0, 0.994
1536, 2048, 0, 1, 0.994
1536, 2096, 0, 0, 1.019
1536, 2096, 0, 1, 1.019
1536, 2048, 48, 0, 1.025
1536, 2048, 48, 1, 1.025
1536, 2096, 48, 0, 0.994
1536, 2096, 48, 1, 0.994
1568, 0, 0, 0, 0.994
1568, 0, 0, 1, 0.994
1568, 49, 0, 0, 0.903
1568, 49, 0, 1, 0.903
1568, 0, 49, 0, 1.144
1568, 0, 49, 1, 1.144
1568, 49, 49, 0, 1.461
1568, 49, 49, 1, 1.461
1568, 2048, 0, 0, 0.993
1568, 2048, 0, 1, 0.993
1568, 2097, 0, 0, 0.903
1568, 2097, 0, 1, 0.903
1568, 2048, 49, 0, 1.09
1568, 2048, 49, 1, 1.09
1568, 2097, 49, 0, 1.46
1568, 2097, 49, 1, 1.46
1600, 0, 0, 0, 0.981
1600, 0, 0, 1, 0.981
1600, 50, 0, 0, 1.022
1600, 50, 0, 1, 1.022
1600, 0, 50, 0, 1.017
1600, 0, 50, 1, 1.017
1600, 50, 50, 0, 0.973
1600, 50, 50, 1, 0.973
1600, 2048, 0, 0, 0.981
1600, 2048, 0, 1, 0.981
1600, 2098, 0, 0, 1.022
1600, 2098, 0, 1, 1.022
1600, 2048, 50, 0, 0.961
1600, 2048, 50, 1, 0.961
1600, 2098, 50, 0, 0.973
1600, 2098, 50, 1, 0.973
1632, 0, 0, 0, 1.019
1632, 0, 0, 1, 1.019
1632, 51, 0, 0, 0.893
1632, 51, 0, 1, 0.893
1632, 0, 51, 0, 1.131
1632, 0, 51, 1, 1.131
1632, 51, 51, 0, 1.444
1632, 51, 51, 1, 1.444
1632, 2048, 0, 0, 1.019
1632, 2048, 0, 1, 1.019
1632, 2099, 0, 0, 0.893
1632, 2099, 0, 1, 0.893
1632, 2048, 51, 0, 1.079
1632, 2048, 51, 1, 1.079
1632, 2099, 51, 0, 1.449
1632, 2099, 51, 1, 1.449
1664, 0, 0, 0, 1.005
1664, 0, 0, 1, 1.004
1664, 52, 0, 0, 0.986
1664, 52, 0, 1, 0.986
1664, 0, 52, 0, 1.004
1664, 0, 52, 1, 1.004
1664, 52, 52, 0, 0.976
1664, 52, 52, 1, 0.976
1664, 2048, 0, 0, 1.006
1664, 2048, 0, 1, 1.006
1664, 2100, 0, 0, 0.993
1664, 2100, 0, 1, 0.993
1664, 2048, 52, 0, 0.946
1664, 2048, 52, 1, 0.946
1664, 2100, 52, 0, 0.976
1664, 2100, 52, 1, 0.976
1696, 0, 0, 0, 0.994
1696, 0, 0, 1, 0.992
1696, 53, 0, 0, 0.884
1696, 53, 0, 1, 0.884
1696, 0, 53, 0, 1.141
1696, 0, 53, 1, 1.141
1696, 53, 53, 0, 1.43
1696, 53, 53, 1, 1.43
1696, 2048, 0, 0, 0.994
1696, 2048, 0, 1, 0.994
1696, 2101, 0, 0, 0.884
1696, 2101, 0, 1, 0.884
1696, 2048, 53, 0, 1.088
1696, 2048, 53, 1, 1.088
1696, 2101, 53, 0, 1.429
1696, 2101, 53, 1, 1.429
1728, 0, 0, 0, 0.978
1728, 0, 0, 1, 0.978
1728, 54, 0, 0, 1.031
1728, 54, 0, 1, 1.033
1728, 0, 54, 0, 1.0
1728, 0, 54, 1, 1.0
1728, 54, 54, 0, 0.96
1728, 54, 54, 1, 0.96
1728, 2048, 0, 0, 0.976
1728, 2048, 0, 1, 0.976
1728, 2102, 0, 0, 1.033
1728, 2102, 0, 1, 1.033
1728, 2048, 54, 0, 0.947
1728, 2048, 54, 1, 0.947
1728, 2102, 54, 0, 0.96
1728, 2102, 54, 1, 0.96
1760, 0, 0, 0, 1.019
1760, 0, 0, 1, 1.021
1760, 55, 0, 0, 0.9
1760, 55, 0, 1, 0.9
1760, 0, 55, 0, 1.125
1760, 0, 55, 1, 1.125
1760, 55, 55, 0, 1.437
1760, 55, 55, 1, 1.436
1760, 2048, 0, 0, 1.016
1760, 2048, 0, 1, 1.015
1760, 2103, 0, 0, 0.9
1760, 2103, 0, 1, 0.9
1760, 2048, 55, 0, 1.073
1760, 2048, 55, 1, 1.074
1760, 2103, 55, 0, 1.44
1760, 2103, 55, 1, 1.44
1792, 0, 0, 0, 1.002
1792, 0, 0, 1, 1.002
1792, 56, 0, 0, 1.028
1792, 56, 0, 1, 1.028
1792, 0, 56, 0, 1.014
1792, 0, 56, 1, 1.015
1792, 56, 56, 0, 1.191
1792, 56, 56, 1, 1.191
1792, 2048, 0, 0, 1.003
1792, 2048, 0, 1, 1.003
1792, 2104, 0, 0, 1.028
1792, 2104, 0, 1, 1.028
1792, 2048, 56, 0, 0.963
1792, 2048, 56, 1, 0.963
1792, 2104, 56, 0, 1.191
1792, 2104, 56, 1, 1.191
1824, 0, 0, 0, 0.999
1824, 0, 0, 1, 1.0
1824, 57, 0, 0, 0.891
1824, 57, 0, 1, 0.891
1824, 0, 57, 0, 1.114
1824, 0, 57, 1, 1.114
1824, 57, 57, 0, 1.407
1824, 57, 57, 1, 1.407
1824, 2048, 0, 0, 1.001
1824, 2048, 0, 1, 1.001
1824, 2105, 0, 0, 0.891
1824, 2105, 0, 1, 0.891
1824, 2048, 57, 0, 1.064
1824, 2048, 57, 1, 1.064
1824, 2105, 57, 0, 1.407
1824, 2105, 57, 1, 1.407
1856, 0, 0, 0, 0.989
1856, 0, 0, 1, 0.987
1856, 58, 0, 0, 1.042
1856, 58, 0, 1, 1.042
1856, 0, 58, 0, 1.007
1856, 0, 58, 1, 1.007
1856, 58, 58, 0, 0.978
1856, 58, 58, 1, 0.972
1856, 2048, 0, 0, 0.992
1856, 2048, 0, 1, 0.992
1856, 2106, 0, 0, 1.042
1856, 2106, 0, 1, 1.042
1856, 2048, 58, 0, 0.954
1856, 2048, 58, 1, 0.954
1856, 2106, 58, 0, 0.979
1856, 2106, 58, 1, 0.972
1888, 0, 0, 0, 0.994
1888, 0, 0, 1, 0.994
1888, 59, 0, 0, 0.883
1888, 59, 0, 1, 0.883
1888, 0, 59, 0, 1.121
1888, 0, 59, 1, 1.123
1888, 59, 59, 0, 1.413
1888, 59, 59, 1, 1.413
1888, 2048, 0, 0, 0.985
1888, 2048, 0, 1, 0.994
1888, 2107, 0, 0, 0.883
1888, 2107, 0, 1, 0.883
1888, 2048, 59, 0, 1.076
1888, 2048, 59, 1, 1.076
1888, 2107, 59, 0, 1.413
1888, 2107, 59, 1, 1.413
1920, 0, 0, 0, 1.0
1920, 0, 0, 1, 0.999
1920, 60, 0, 0, 1.033
1920, 60, 0, 1, 1.033
1920, 0, 60, 0, 0.996
1920, 0, 60, 1, 0.997
1920, 60, 60, 0, 0.968
1920, 60, 60, 1, 0.968
1920, 2048, 0, 0, 1.0
1920, 2048, 0, 1, 1.0
1920, 2108, 0, 0, 1.034
1920, 2108, 0, 1, 1.034
1920, 2048, 60, 0, 0.949
1920, 2048, 60, 1, 0.949
1920, 2108, 60, 0, 0.968
1920, 2108, 60, 1, 0.968
1952, 0, 0, 0, 1.004
1952, 0, 0, 1, 1.004
1952, 61, 0, 0, 0.898
1952, 61, 0, 1, 0.898
1952, 0, 61, 0, 1.118
1952, 0, 61, 1, 1.118
1952, 61, 61, 0, 1.387
1952, 61, 61, 1, 1.387
1952, 2048, 0, 0, 1.004
1952, 2048, 0, 1, 1.004
1952, 2109, 0, 0, 0.898
1952, 2109, 0, 1, 0.898
1952, 2048, 61, 0, 1.071
1952, 2048, 61, 1, 1.071
1952, 2109, 61, 0, 1.387
1952, 2109, 61, 1, 1.387
1984, 0, 0, 0, 0.993
1984, 0, 0, 1, 0.993
1984, 62, 0, 0, 1.025
1984, 62, 0, 1, 1.025
1984, 0, 62, 0, 1.005
1984, 0, 62, 1, 1.007
1984, 62, 62, 0, 0.982
1984, 62, 62, 1, 0.982
1984, 2048, 0, 0, 0.993
1984, 2048, 0, 1, 0.993
1984, 2110, 0, 0, 1.025
1984, 2110, 0, 1, 1.025
1984, 2048, 62, 0, 0.96
1984, 2048, 62, 1, 0.96
1984, 2110, 62, 0, 0.982
1984, 2110, 62, 1, 0.982
2016, 0, 0, 0, 1.0
2016, 0, 0, 1, 0.999
2016, 63, 0, 0, 0.889
2016, 63, 0, 1, 0.89
2016, 0, 63, 0, 1.091
2016, 0, 63, 1, 1.092
2016, 63, 63, 0, 1.362
2016, 63, 63, 1, 1.363
2016, 2048, 0, 0, 1.0
2016, 2048, 0, 1, 1.0
2016, 2111, 0, 0, 0.965
2016, 2111, 0, 1, 0.965
2016, 2048, 63, 0, 1.049
2016, 2048, 63, 1, 1.049
2016, 2111, 63, 0, 1.405
2016, 2111, 63, 1, 1.405
2048, 32, 0, 0, 1.01
2048, 32, 0, 1, 1.01
2048, 0, 32, 0, 1.005
2048, 0, 32, 1, 1.005
2048, 32, 32, 0, 1.005
2048, 32, 32, 1, 1.005
2048, 0, 1, 0, 0.983
2048, 0, 1, 1, 0.984
2048, 1, 0, 0, 1.039
2048, 1, 0, 1, 1.039
2048, 32, 1, 0, 1.063
2048, 32, 1, 1, 1.063
2048, 1, 32, 0, 0.94
2048, 1, 32, 1, 0.94
2048, 2048, 1, 0, 0.981
2048, 2048, 1, 1, 0.981
2048, 2049, 0, 0, 0.904
2048, 2049, 0, 1, 0.904
2112, 0, 0, 0, 0.996
2112, 0, 0, 1, 0.995
2112, 1, 0, 0, 1.031
2112, 1, 0, 1, 1.031
2112, 33, 0, 0, 1.01
2112, 33, 0, 1, 1.01
2112, 0, 1, 0, 0.972
2112, 0, 1, 1, 0.972
2112, 0, 33, 0, 0.987
2112, 0, 33, 1, 0.987
2112, 1, 1, 0, 0.914
2112, 1, 1, 1, 0.914
2112, 33, 33, 0, 0.983
2112, 33, 33, 1, 0.983
2112, 2048, 0, 0, 0.994
2112, 2048, 0, 1, 0.99
2112, 2049, 0, 0, 1.031
2112, 2049, 0, 1, 1.031
2112, 2048, 1, 0, 0.955
2112, 2048, 1, 1, 0.955
2112, 2049, 1, 0, 0.906
2112, 2049, 1, 1, 0.906
2112, 33, 1, 0, 1.163
2112, 33, 1, 1, 1.164
2112, 1, 33, 0, 1.046
2112, 1, 33, 1, 1.046
2176, 0, 0, 0, 0.984
2176, 0, 0, 1, 0.985
2176, 2, 0, 0, 1.023
2176, 2, 0, 1, 1.023
2176, 34, 0, 0, 1.0
2176, 34, 0, 1, 1.0
2176, 0, 2, 0, 0.985
2176, 0, 2, 1, 0.985
2176, 0, 34, 0, 0.995
2176, 0, 34, 1, 0.982
2176, 2, 2, 0, 0.928
2176, 2, 2, 1, 0.928
2176, 34, 34, 0, 1.004
2176, 34, 34, 1, 1.004
2176, 2048, 0, 0, 0.985
2176, 2048, 0, 1, 0.986
2176, 2050, 0, 0, 1.023
2176, 2050, 0, 1, 1.023
2176, 2048, 2, 0, 0.802
2176, 2048, 2, 1, 0.802
2176, 2050, 2, 0, 0.894
2176, 2050, 2, 1, 0.894
2176, 2, 1, 0, 1.068
2176, 2, 1, 1, 1.068
2176, 1, 2, 0, 0.976
2176, 1, 2, 1, 0.976
2176, 34, 1, 0, 1.077
2176, 34, 1, 1, 1.077
2176, 1, 34, 0, 0.978
2176, 1, 34, 1, 0.978
2176, 2050, 1, 0, 1.061
2176, 2050, 1, 1, 1.061
2176, 2049, 2, 0, 0.971
2176, 2049, 2, 1, 0.971
2240, 0, 0, 0, 0.994
2240, 0, 0, 1, 0.994
2240, 3, 0, 0, 1.038
2240, 3, 0, 1, 1.039
2240, 35, 0, 0, 1.019
2240, 35, 0, 1, 1.019
2240, 0, 3, 0, 0.979
2240, 0, 3, 1, 0.98
2240, 0, 35, 0, 0.991
2240, 0, 35, 1, 0.991
2240, 3, 3, 0, 0.931
2240, 3, 3, 1, 0.931
2240, 35, 35, 0, 0.999
2240, 35, 35, 1, 0.999
2240, 2048, 0, 0, 0.995
2240, 2048, 0, 1, 0.995
2240, 2051, 0, 0, 1.039
2240, 2051, 0, 1, 1.039
2240, 2048, 3, 0, 0.799
2240, 2048, 3, 1, 0.799
2240, 2051, 3, 0, 0.889
2240, 2051, 3, 1, 0.889
2240, 3, 1, 0, 1.06
2240, 3, 1, 1, 1.06
2240, 1, 3, 0, 0.968
2240, 1, 3, 1, 0.968
2240, 35, 1, 0, 1.071
2240, 35, 1, 1, 1.071
2240, 1, 35, 0, 0.971
2240, 1, 35, 1, 0.971
2240, 2051, 1, 0, 1.057
2240, 2051, 1, 1, 1.057
2240, 2049, 3, 0, 0.966
2240, 2049, 3, 1, 0.966
2304, 0, 0, 0, 0.986
2304, 0, 0, 1, 0.986
2304, 4, 0, 0, 1.031
2304, 4, 0, 1, 1.032
2304, 36, 0, 0, 1.011
2304, 36, 0, 1, 1.011
2304, 0, 4, 0, 0.968
2304, 0, 4, 1, 0.969
2304, 0, 36, 0, 0.988
2304, 0, 36, 1, 0.988
2304, 4, 4, 0, 0.93
2304, 4, 4, 1, 0.931
2304, 36, 36, 0, 0.992
2304, 36, 36, 1, 0.992
2304, 2048, 0, 0, 0.988
2304, 2048, 0, 1, 0.988
2304, 2052, 0, 0, 1.032
2304, 2052, 0, 1, 1.032
2304, 2048, 4, 0, 0.793
2304, 2048, 4, 1, 0.793
2304, 2052, 4, 0, 0.884
2304, 2052, 4, 1, 0.884
2304, 4, 1, 0, 0.989
2304, 4, 1, 1, 0.989
2304, 1, 4, 0, 0.897
2304, 1, 4, 1, 0.898
2304, 36, 1, 0, 1.057
2304, 36, 1, 1, 1.057
2304, 1, 36, 0, 0.966
2304, 1, 36, 1, 0.966
2304, 2052, 1, 0, 1.052
2304, 2052, 1, 1, 1.052
2304, 2049, 4, 0, 0.955
2304, 2049, 4, 1, 0.955
2368, 0, 0, 0, 1.0
2368, 0, 0, 1, 1.001
2368, 5, 0, 0, 1.024
2368, 5, 0, 1, 1.025
2368, 37, 0, 0, 1.0
2368, 37, 0, 1, 1.0
2368, 0, 5, 0, 0.98
2368, 0, 5, 1, 0.981
2368, 0, 37, 0, 0.983
2368, 0, 37, 1, 0.98
2368, 5, 5, 0, 0.944
2368, 5, 5, 1, 0.944
2368, 37, 37, 0, 1.003
2368, 37, 37, 1, 1.003
2368, 2048, 0, 0, 1.002
2368, 2048, 0, 1, 1.002
2368, 2053, 0, 0, 1.025
2368, 2053, 0, 1, 1.025
2368, 2048, 5, 0, 0.801
2368, 2048, 5, 1, 0.801
2368, 2053, 5, 0, 0.907
2368, 2053, 5, 1, 0.907
2368, 5, 1, 0, 1.071
2368, 5, 1, 1, 1.071
2368, 1, 5, 0, 0.973
2368, 1, 5, 1, 0.973
2368, 37, 1, 0, 1.07
2368, 37, 1, 1, 1.07
2368, 1, 37, 0, 0.974
2368, 1, 37, 1, 0.974
2368, 2053, 1, 0, 1.065
2368, 2053, 1, 1, 1.065
2368, 2049, 5, 0, 0.967
2368, 2049, 5, 1, 0.967
2432, 0, 0, 0, 0.965
2432, 0, 0, 1, 1.0
2432, 6, 0, 0, 1.038
2432, 6, 0, 1, 1.039
2432, 38, 0, 0, 1.021
2432, 38, 0, 1, 1.021
2432, 0, 6, 0, 0.974
2432, 0, 6, 1, 0.976
2432, 0, 38, 0, 0.986
2432, 0, 38, 1, 0.986
2432, 6, 6, 0, 0.926
2432, 6, 6, 1, 0.926
2432, 38, 38, 0, 1.0
2432, 38, 38, 1, 1.0
2432, 2048, 0, 0, 1.004
2432, 2048, 0, 1, 1.004
2432, 2054, 0, 0, 1.039
2432, 2054, 0, 1, 1.039
2432, 2048, 6, 0, 0.797
2432, 2048, 6, 1, 0.797
2432, 2054, 6, 0, 0.898
2432, 2054, 6, 1, 0.898
2432, 6, 1, 0, 1.063
2432, 6, 1, 1, 1.063
2432, 1, 6, 0, 0.965
2432, 1, 6, 1, 0.965
2432, 38, 1, 0, 1.068
2432, 38, 1, 1, 1.068
2432, 1, 38, 0, 0.968
2432, 1, 38, 1, 0.968
2432, 2054, 1, 0, 1.06
2432, 2054, 1, 1, 1.06
2432, 2049, 6, 0, 0.963
2432, 2049, 6, 1, 0.963
2496, 0, 0, 0, 1.013
2496, 0, 0, 1, 1.013
2496, 7, 0, 0, 1.032
2496, 7, 0, 1, 1.032
2496, 39, 0, 0, 1.013
2496, 39, 0, 1, 1.013
2496, 0, 7, 0, 0.965
2496, 0, 7, 1, 0.965
2496, 0, 39, 0, 0.979
2496, 0, 39, 1, 0.979
2496, 7, 7, 0, 0.925
2496, 7, 7, 1, 0.925
2496, 39, 39, 0, 0.989
2496, 39, 39, 1, 0.989
2496, 2048, 0, 0, 1.013
2496, 2048, 0, 1, 1.013
2496, 2055, 0, 0, 1.032
2496, 2055, 0, 1, 1.032
2496, 2048, 7, 0, 0.792
2496, 2048, 7, 1, 0.792
2496, 2055, 7, 0, 0.93
2496, 2055, 7, 1, 0.93
2496, 7, 1, 0, 0.984
2496, 7, 1, 1, 0.984
2496, 1, 7, 0, 0.894
2496, 1, 7, 1, 0.895
2496, 39, 1, 0, 1.054
2496, 39, 1, 1, 1.054
2496, 1, 39, 0, 0.963
2496, 1, 39, 1, 0.963
2496, 2055, 1, 0, 1.049
2496, 2055, 1, 1, 1.049
2496, 2049, 7, 0, 0.953
2496, 2049, 7, 1, 0.953
2560, 0, 0, 0, 0.991
2560, 0, 0, 1, 0.991
2560, 8, 0, 0, 1.031
2560, 8, 0, 1, 1.032
2560, 40, 0, 0, 1.029
2560, 40, 0, 1, 1.029
2560, 0, 8, 0, 0.992
2560, 0, 8, 1, 0.992
2560, 0, 40, 0, 0.975
2560, 0, 40, 1, 0.984
2560, 8, 8, 0, 0.942
2560, 8, 8, 1, 0.943
2560, 40, 40, 0, 1.139
2560, 40, 40, 1, 1.139
2560, 2048, 0, 0, 0.993
2560, 2048, 0, 1, 0.993
2560, 2056, 0, 0, 1.032
2560, 2056, 0, 1, 1.032
2560, 2048, 8, 0, 0.812
2560, 2048, 8, 1, 0.812
2560, 2056, 8, 0, 0.912
2560, 2056, 8, 1, 0.912
2560, 8, 1, 0, 1.068
2560, 8, 1, 1, 1.069
2560, 1, 8, 0, 0.974
2560, 1, 8, 1, 0.974
2560, 40, 1, 0, 1.068
2560, 40, 1, 1, 1.068
2560, 1, 40, 0, 0.996
2560, 1, 40, 1, 0.996
2560, 2056, 1, 0, 1.063
2560, 2056, 1, 1, 1.063
2560, 2049, 8, 0, 0.969
2560, 2049, 8, 1, 0.969
2624, 0, 0, 0, 0.995
2624, 0, 0, 1, 0.994
2624, 9, 0, 0, 1.015
2624, 9, 0, 1, 1.018
2624, 41, 0, 0, 1.044
2624, 41, 0, 1, 1.044
2624, 0, 9, 0, 0.988
2624, 0, 9, 1, 0.99
2624, 0, 41, 0, 0.989
2624, 0, 41, 1, 0.99
2624, 9, 9, 0, 0.943
2624, 9, 9, 1, 0.943
2624, 41, 41, 0, 0.993
2624, 41, 41, 1, 0.993
2624, 2048, 0, 0, 0.998
2624, 2048, 0, 1, 0.998
2624, 2057, 0, 0, 1.018
2624, 2057, 0, 1, 1.018
2624, 2048, 9, 0, 0.81
2624, 2048, 9, 1, 0.81
2624, 2057, 9, 0, 0.907
2624, 2057, 9, 1, 0.907
2624, 9, 1, 0, 1.09
2624, 9, 1, 1, 1.09
2624, 1, 9, 0, 0.967
2624, 1, 9, 1, 0.967
2624, 41, 1, 0, 1.084
2624, 41, 1, 1, 1.085
2624, 1, 41, 0, 0.958
2624, 1, 41, 1, 0.957
2624, 2057, 1, 0, 1.087
2624, 2057, 1, 1, 1.087
2624, 2049, 9, 0, 0.965
2624, 2049, 9, 1, 0.965
2688, 0, 0, 0, 0.995
2688, 0, 0, 1, 0.995
2688, 10, 0, 0, 1.01
2688, 10, 0, 1, 1.012
2688, 42, 0, 0, 1.036
2688, 42, 0, 1, 1.036
2688, 0, 10, 0, 0.978
2688, 0, 10, 1, 0.979
2688, 0, 42, 0, 0.977
2688, 0, 42, 1, 0.978
2688, 10, 10, 0, 0.942
2688, 10, 10, 1, 0.942
2688, 42, 42, 0, 0.989
2688, 42, 42, 1, 0.989
2688, 2048, 0, 0, 0.995
2688, 2048, 0, 1, 0.995
2688, 2058, 0, 0, 1.012
2688, 2058, 0, 1, 1.012
2688, 2048, 10, 0, 0.804
2688, 2048, 10, 1, 0.804
2688, 2058, 10, 0, 0.905
2688, 2058, 10, 1, 0.905
2688, 10, 1, 0, 0.986
2688, 10, 1, 1, 0.987
2688, 1, 10, 0, 0.893
2688, 1, 10, 1, 0.894
2688, 42, 1, 0, 1.054
2688, 42, 1, 1, 1.054
2688, 1, 42, 0, 0.958
2688, 1, 42, 1, 0.958
2688, 2058, 1, 0, 1.052
2688, 2058, 1, 1, 1.052
2688, 2049, 10, 0, 0.954
2688, 2049, 10, 1, 0.954
2752, 0, 0, 0, 1.0
2752, 0, 0, 1, 0.992
2752, 11, 0, 0, 0.954
2752, 11, 0, 1, 0.954
2752, 43, 0, 0, 0.979
2752, 43, 0, 1, 0.979
2752, 0, 11, 0, 0.939
2752, 0, 11, 1, 0.939
2752, 0, 43, 0, 0.931
2752, 0, 43, 1, 0.932
2752, 11, 11, 0, 0.949
2752, 11, 11, 1, 0.949
2752, 43, 43, 0, 1.007
2752, 43, 43, 1, 1.007
2752, 2048, 0, 0, 0.993
2752, 2048, 0, 1, 0.993
2752, 2059, 0, 0, 0.954
2752, 2059, 0, 1, 0.954
2752, 2048, 11, 0, 0.77
2752, 2048, 11, 1, 0.77
2752, 2059, 11, 0, 0.916
2752, 2059, 11, 1, 0.916
2752, 11, 1, 0, 0.994
2752, 11, 1, 1, 0.994
2752, 1, 11, 0, 0.928
2752, 1, 11, 1, 0.928
2752, 43, 1, 0, 1.022
2752, 43, 1, 1, 1.022
2752, 1, 43, 0, 0.92
2752, 1, 43, 1, 0.92
2752, 2059, 1, 0, 0.989
2752, 2059, 1, 1, 0.989
2752, 2049, 11, 0, 0.923
2752, 2049, 11, 1, 0.923
2816, 0, 0, 0, 1.003
2816, 0, 0, 1, 1.003
2816, 12, 0, 0, 0.897
2816, 12, 0, 1, 0.894
2816, 44, 0, 0, 0.914
2816, 44, 0, 1, 0.914
2816, 0, 12, 0, 0.876
2816, 0, 12, 1, 0.874
2816, 0, 44, 0, 0.871
2816, 0, 44, 1, 0.87
2816, 12, 12, 0, 0.948
2816, 12, 12, 1, 0.948
2816, 44, 44, 0, 1.009
2816, 44, 44, 1, 1.009
2816, 2048, 0, 0, 1.005
2816, 2048, 0, 1, 1.005
2816, 2060, 0, 0, 0.894
2816, 2060, 0, 1, 0.894
2816, 2048, 12, 0, 0.714
2816, 2048, 12, 1, 0.713
2816, 2060, 12, 0, 0.915
2816, 2060, 12, 1, 0.915
2816, 12, 1, 0, 0.917
2816, 12, 1, 1, 0.917
2816, 1, 12, 0, 0.858
2816, 1, 12, 1, 0.857
2816, 44, 1, 0, 0.944
2816, 44, 1, 1, 0.943
2816, 1, 44, 0, 0.856
2816, 1, 44, 1, 0.856
2816, 2060, 1, 0, 0.914
2816, 2060, 1, 1, 0.914
2816, 2049, 12, 0, 0.855
2816, 2049, 12, 1, 0.855
2880, 0, 0, 0, 0.989
2880, 0, 0, 1, 0.989
2880, 13, 0, 0, 0.967
2880, 13, 0, 1, 0.967
2880, 45, 0, 0, 0.987
2880, 45, 0, 1, 0.987
2880, 0, 13, 0, 0.925
2880, 0, 13, 1, 0.925
2880, 0, 45, 0, 0.927
2880, 0, 45, 1, 0.927
2880, 13, 13, 0, 0.944
2880, 13, 13, 1, 0.944
2880, 45, 45, 0, 1.003
2880, 45, 45, 1, 1.003
2880, 2048, 0, 0, 0.989
2880, 2048, 0, 1, 0.989
2880, 2061, 0, 0, 0.967
2880, 2061, 0, 1, 0.967
2880, 2048, 13, 0, 0.76
2880, 2048, 13, 1, 0.76
2880, 2061, 13, 0, 0.91
2880, 2061, 13, 1, 0.91
2880, 13, 1, 0, 0.922
2880, 13, 1, 1, 0.922
2880, 1, 13, 0, 0.859
2880, 1, 13, 1, 0.859
2880, 45, 1, 0, 1.013
2880, 45, 1, 1, 1.013
2880, 1, 45, 0, 0.92
2880, 1, 45, 1, 0.92
2880, 2061, 1, 0, 0.984
2880, 2061, 1, 1, 0.984
2880, 2049, 13, 0, 0.918
2880, 2049, 13, 1, 0.918
2944, 0, 0, 0, 1.014
2944, 0, 0, 1, 1.014
2944, 14, 0, 0, 0.956
2944, 14, 0, 1, 0.955
2944, 46, 0, 0, 0.979
2944, 46, 0, 1, 0.979
2944, 0, 14, 0, 0.937
2944, 0, 14, 1, 0.937
2944, 0, 46, 0, 0.93
2944, 0, 46, 1, 0.93
2944, 14, 14, 0, 0.953
2944, 14, 14, 1, 0.953
2944, 46, 46, 0, 1.009
2944, 46, 46, 1, 1.009
2944, 2048, 0, 0, 1.015
2944, 2048, 0, 1, 1.015
2944, 2062, 0, 0, 0.955
2944, 2062, 0, 1, 0.955
2944, 2048, 14, 0, 0.769
2944, 2048, 14, 1, 0.769
2944, 2062, 14, 0, 0.923
2944, 2062, 14, 1, 0.923
2944, 14, 1, 0, 0.994
2944, 14, 1, 1, 0.994
2944, 1, 14, 0, 0.927
2944, 1, 14, 1, 0.927
2944, 46, 1, 0, 1.021
2944, 46, 1, 1, 1.021
2944, 1, 46, 0, 0.923
2944, 1, 46, 1, 0.923
2944, 2062, 1, 0, 0.988
2944, 2062, 1, 1, 0.988
2944, 2049, 14, 0, 0.922
2944, 2049, 14, 1, 0.922
3008, 0, 0, 0, 0.994
3008, 0, 0, 1, 0.994
3008, 15, 0, 0, 0.941
3008, 15, 0, 1, 0.941
3008, 47, 0, 0, 0.996
3008, 47, 0, 1, 0.996
3008, 0, 15, 0, 0.929
3008, 0, 15, 1, 0.933
3008, 0, 47, 0, 0.933
3008, 0, 47, 1, 0.933
3008, 15, 15, 0, 0.952
3008, 15, 15, 1, 0.949
3008, 47, 47, 0, 1.003
3008, 47, 47, 1, 1.003
3008, 2048, 0, 0, 0.998
3008, 2048, 0, 1, 0.998
3008, 2063, 0, 0, 0.941
3008, 2063, 0, 1, 0.941
3008, 2048, 15, 0, 0.766
3008, 2048, 15, 1, 0.766
3008, 2063, 15, 0, 0.916
3008, 2063, 15, 1, 0.916
3008, 15, 1, 0, 0.985
3008, 15, 1, 1, 0.985
3008, 1, 15, 0, 0.916
3008, 1, 15, 1, 0.916
3008, 47, 1, 0, 1.014
3008, 47, 1, 1, 1.014
3008, 1, 47, 0, 0.902
3008, 1, 47, 1, 0.902
3008, 2063, 1, 0, 0.981
3008, 2063, 1, 1, 0.981
3008, 2049, 15, 0, 0.912
3008, 2049, 15, 1, 0.913
3072, 0, 0, 0, 1.016
3072, 0, 0, 1, 1.015
3072, 16, 0, 0, 1.045
3072, 16, 0, 1, 1.045
3072, 48, 0, 0, 1.045
3072, 48, 0, 1, 1.045
3072, 0, 16, 0, 1.049
3072, 0, 16, 1, 1.049
3072, 0, 48, 0, 1.049
3072, 0, 48, 1, 1.049
3072, 16, 16, 0, 1.016
3072, 16, 16, 1, 1.016
3072, 48, 48, 0, 1.016
3072, 48, 48, 1, 1.016
3072, 2048, 0, 0, 1.016
3072, 2048, 0, 1, 1.016
3072, 2064, 0, 0, 1.045
3072, 2064, 0, 1, 1.045
3072, 2048, 16, 0, 1.049
3072, 2048, 16, 1, 1.049
3072, 2064, 16, 0, 1.016
3072, 2064, 16, 1, 1.016
3072, 16, 1, 0, 0.815
3072, 16, 1, 1, 0.815
3072, 1, 16, 0, 0.872
3072, 1, 16, 1, 0.872
3072, 48, 1, 0, 1.017
3072, 48, 1, 1, 1.017
3072, 1, 48, 0, 0.872
3072, 1, 48, 1, 0.872
3072, 2064, 1, 0, 0.815
3072, 2064, 1, 1, 0.815
3072, 2049, 16, 0, 0.872
3072, 2049, 16, 1, 0.872
3136, 0, 0, 0, 0.995
3136, 0, 0, 1, 0.995
3136, 17, 0, 0, 0.949
3136, 17, 0, 1, 0.949
3136, 49, 0, 0, 0.987
3136, 49, 0, 1, 0.987
3136, 0, 17, 0, 0.919
3136, 0, 17, 1, 0.917
3136, 0, 49, 0, 0.931
3136, 0, 49, 1, 0.931
3136, 17, 17, 0, 1.122
3136, 17, 17, 1, 1.119
3136, 49, 49, 0, 0.987
3136, 49, 49, 1, 0.987
3136, 2048, 0, 0, 0.997
3136, 2048, 0, 1, 0.997
3136, 2065, 0, 0, 0.949
3136, 2065, 0, 1, 0.949
3136, 2048, 17, 0, 0.896
3136, 2048, 17, 1, 0.896
3136, 2065, 17, 0, 1.122
3136, 2065, 17, 1, 1.119
3136, 17, 1, 0, 1.184
3136, 17, 1, 1, 1.184
3136, 1, 17, 0, 1.124
3136, 1, 17, 1, 1.125
3136, 49, 1, 0, 1.11
3136, 49, 1, 1, 1.108
3136, 1, 49, 0, 1.044
3136, 1, 49, 1, 1.044
3136, 2065, 1, 0, 1.147
3136, 2065, 1, 1, 1.147
3136, 2049, 17, 0, 1.102
3136, 2049, 17, 1, 1.1
3200, 0, 0, 0, 1.006
3200, 0, 0, 1, 1.006
3200, 18, 0, 0, 0.978
3200, 18, 0, 1, 0.978
3200, 50, 0, 0, 0.998
3200, 50, 0, 1, 0.998
3200, 0, 18, 0, 0.932
3200, 0, 18, 1, 0.932
3200, 0, 50, 0, 0.93
3200, 0, 50, 1, 0.93
3200, 18, 18, 0, 1.11
3200, 18, 18, 1, 1.11
3200, 50, 50, 0, 0.994
3200, 50, 50, 1, 0.994
3200, 2048, 0, 0, 1.007
3200, 2048, 0, 1, 1.007
3200, 2066, 0, 0, 0.978
3200, 2066, 0, 1, 0.978
3200, 2048, 18, 0, 0.894
3200, 2048, 18, 1, 0.894
3200, 2066, 18, 0, 1.11
3200, 2066, 18, 1, 1.11
3200, 18, 1, 0, 1.002
3200, 18, 1, 1, 1.002
3200, 1, 18, 0, 0.917
3200, 1, 18, 1, 0.917
3200, 50, 1, 0, 0.963
3200, 50, 1, 1, 0.964
3200, 1, 50, 0, 0.888
3200, 1, 50, 1, 0.888
3200, 2066, 1, 0, 1.002
3200, 2066, 1, 1, 1.002
3200, 2049, 18, 0, 0.914
3200, 2049, 18, 1, 0.914
3264, 0, 0, 0, 0.994
3264, 0, 0, 1, 0.994
3264, 19, 0, 0, 0.959
3264, 19, 0, 1, 0.959
3264, 51, 0, 0, 0.994
3264, 51, 0, 1, 0.994
3264, 0, 19, 0, 0.927
3264, 0, 19, 1, 0.927
3264, 0, 51, 0, 0.927
3264, 0, 51, 1, 0.927
3264, 19, 19, 0, 1.1
3264, 19, 19, 1, 1.1
3264, 51, 51, 0, 0.982
3264, 51, 51, 1, 0.982
3264, 2048, 0, 0, 0.994
3264, 2048, 0, 1, 0.994
3264, 2067, 0, 0, 0.959
3264, 2067, 0, 1, 0.959
3264, 2048, 19, 0, 0.891
3264, 2048, 19, 1, 0.891
3264, 2067, 19, 0, 1.099
3264, 2067, 19, 1, 1.099
3264, 19, 1, 0, 0.977
3264, 19, 1, 1, 0.976
3264, 1, 19, 0, 0.921
3264, 1, 19, 1, 0.921
3264, 51, 1, 0, 0.959
3264, 51, 1, 1, 0.959
3264, 1, 51, 0, 0.886
3264, 1, 51, 1, 0.886
3264, 2067, 1, 0, 0.976
3264, 2067, 1, 1, 0.976
3264, 2049, 19, 0, 0.917
3264, 2049, 19, 1, 0.917
3328, 0, 0, 0, 0.996
3328, 0, 0, 1, 0.992
3328, 20, 0, 0, 0.955
3328, 20, 0, 1, 0.955
3328, 52, 0, 0, 0.99
3328, 52, 0, 1, 0.99
3328, 0, 20, 0, 0.926
3328, 0, 20, 1, 0.923
3328, 0, 52, 0, 0.933
3328, 0, 52, 1, 0.933
3328, 20, 20, 0, 1.11
3328, 20, 20, 1, 1.11
3328, 52, 52, 0, 0.988
3328, 52, 52, 1, 0.988
3328, 2048, 0, 0, 0.993
3328, 2048, 0, 1, 0.993
3328, 2068, 0, 0, 0.955
3328, 2068, 0, 1, 0.955
3328, 2048, 20, 0, 0.9
3328, 2048, 20, 1, 0.9
3328, 2068, 20, 0, 1.109
3328, 2068, 20, 1, 1.109
3328, 20, 1, 0, 0.99
3328, 20, 1, 1, 0.99
3328, 1, 20, 0, 0.922
3328, 1, 20, 1, 0.922
3328, 52, 1, 0, 0.972
3328, 52, 1, 1, 0.972
3328, 1, 52, 0, 0.901
3328, 1, 52, 1, 0.901
3328, 2068, 1, 0, 0.99
3328, 2068, 1, 1, 0.99
3328, 2049, 20, 0, 0.918
3328, 2049, 20, 1, 0.918
3392, 0, 0, 0, 0.998
3392, 0, 0, 1, 1.0
3392, 21, 0, 0, 0.964
3392, 21, 0, 1, 0.964
3392, 53, 0, 0, 0.998
3392, 53, 0, 1, 0.998
3392, 0, 21, 0, 0.932
3392, 0, 21, 1, 0.932
3392, 0, 53, 0, 0.93
3392, 0, 53, 1, 0.93
3392, 21, 21, 0, 1.113
3392, 21, 21, 1, 1.113
3392, 53, 53, 0, 0.983
3392, 53, 53, 1, 0.983
3392, 2048, 0, 0, 1.0
3392, 2048, 0, 1, 1.0
3392, 2069, 0, 0, 0.964
3392, 2069, 0, 1, 0.964
3392, 2048, 21, 0, 0.895
3392, 2048, 21, 1, 0.896
3392, 2069, 21, 0, 1.113
3392, 2069, 21, 1, 1.113
3392, 21, 1, 0, 0.994
3392, 21, 1, 1, 0.994
3392, 1, 21, 0, 0.923
3392, 1, 21, 1, 0.923
3392, 53, 1, 0, 0.972
3392, 53, 1, 1, 0.972
3392, 1, 53, 0, 0.891
3392, 1, 53, 1, 0.891
3392, 2069, 1, 0, 0.994
3392, 2069, 1, 1, 0.994
3392, 2049, 21, 0, 0.922
3392, 2049, 21, 1, 0.922
3456, 0, 0, 0, 0.995
3456, 0, 0, 1, 0.995
3456, 22, 0, 0, 0.965
3456, 22, 0, 1, 0.965
3456, 54, 0, 0, 0.996
3456, 54, 0, 1, 0.996
3456, 0, 22, 0, 0.927
3456, 0, 22, 1, 0.927
3456, 0, 54, 0, 0.927
3456, 0, 54, 1, 0.927
3456, 22, 22, 0, 1.107
3456, 22, 22, 1, 1.107
3456, 54, 54, 0, 0.98
3456, 54, 54, 1, 0.98
3456, 2048, 0, 0, 0.995
3456, 2048, 0, 1, 0.995
3456, 2070, 0, 0, 0.965
3456, 2070, 0, 1, 0.965
3456, 2048, 22, 0, 0.893
3456, 2048, 22, 1, 0.893
3456, 2070, 22, 0, 1.107
3456, 2070, 22, 1, 1.107
3456, 22, 1, 0, 0.988
3456, 22, 1, 1, 0.988
3456, 1, 22, 0, 0.921
3456, 1, 22, 1, 0.921
3456, 54, 1, 0, 0.963
3456, 54, 1, 1, 0.963
3456, 1, 54, 0, 0.887
3456, 1, 54, 1, 0.887
3456, 2070, 1, 0, 0.988
3456, 2070, 1, 1, 0.988
3456, 2049, 22, 0, 0.917
3456, 2049, 22, 1, 0.917
3520, 0, 0, 0, 1.016
3520, 0, 0, 1, 1.016
3520, 23, 0, 0, 0.957
3520, 23, 0, 1, 0.957
3520, 55, 0, 0, 0.991
3520, 55, 0, 1, 0.991
3520, 0, 23, 0, 0.919
3520, 0, 23, 1, 0.924
3520, 0, 55, 0, 0.934
3520, 0, 55, 1, 0.934
3520, 23, 23, 0, 1.111
3520, 23, 23, 1, 1.111
3520, 55, 55, 0, 0.994
3520, 55, 55, 1, 0.994
3520, 2048, 0, 0, 1.016
3520, 2048, 0, 1, 1.016
3520, 2071, 0, 0, 0.957
3520, 2071, 0, 1, 0.957
3520, 2048, 23, 0, 0.903
3520, 2048, 23, 1, 0.903
3520, 2071, 23, 0, 1.111
3520, 2071, 23, 1, 1.111
3520, 23, 1, 0, 0.997
3520, 23, 1, 1, 0.997
3520, 1, 23, 0, 0.921
3520, 1, 23, 1, 0.921
3520, 55, 1, 0, 0.976
3520, 55, 1, 1, 0.976
3520, 1, 55, 0, 0.902
3520, 1, 55, 1, 0.902
3520, 2071, 1, 0, 0.997
3520, 2071, 1, 1, 0.997
3520, 2049, 23, 0, 0.918
3520, 2049, 23, 1, 0.918
3584, 0, 0, 0, 1.004
3584, 0, 0, 1, 1.004
3584, 24, 0, 0, 0.985
3584, 24, 0, 1, 0.979
3584, 56, 0, 0, 1.006
3584, 56, 0, 1, 1.006
3584, 0, 24, 0, 0.931
3584, 0, 24, 1, 0.931
3584, 0, 56, 0, 0.93
3584, 0, 56, 1, 0.93
3584, 24, 24, 0, 1.111
3584, 24, 24, 1, 1.11
3584, 56, 56, 0, 1.101
3584, 56, 56, 1, 1.1
3584, 2048, 0, 0, 1.005
3584, 2048, 0, 1, 1.005
3584, 2072, 0, 0, 0.98
3584, 2072, 0, 1, 0.978
3584, 2048, 24, 0, 0.896
3584, 2048, 24, 1, 0.897
3584, 2072, 24, 0, 1.111
3584, 2072, 24, 1, 1.111
3584, 24, 1, 0, 1.004
3584, 24, 1, 1, 1.004
3584, 1, 24, 0, 0.921
3584, 1, 24, 1, 0.921
3584, 56, 1, 0, 0.971
3584, 56, 1, 1, 0.97
3584, 1, 56, 0, 0.89
3584, 1, 56, 1, 0.89
3584, 2072, 1, 0, 1.004
3584, 2072, 1, 1, 1.004
3584, 2049, 24, 0, 0.918
3584, 2049, 24, 1, 0.918
3648, 0, 0, 0, 1.012
3648, 0, 0, 1, 1.012
3648, 25, 0, 0, 0.96
3648, 25, 0, 1, 0.96
3648, 57, 0, 0, 0.988
3648, 57, 0, 1, 0.988
3648, 0, 25, 0, 0.927
3648, 0, 25, 1, 0.927
3648, 0, 57, 0, 0.927
3648, 0, 57, 1, 0.927
3648, 25, 25, 0, 1.101
3648, 25, 25, 1, 1.101
3648, 57, 57, 0, 0.986
3648, 57, 57, 1, 0.986
3648, 2048, 0, 0, 1.012
3648, 2048, 0, 1, 1.012
3648, 2073, 0, 0, 0.96
3648, 2073, 0, 1, 0.959
3648, 2048, 25, 0, 0.894
3648, 2048, 25, 1, 0.895
3648, 2073, 25, 0, 1.103
3648, 2073, 25, 1, 1.103
3648, 25, 1, 0, 1.024
3648, 25, 1, 1, 1.024
3648, 1, 25, 0, 0.911
3648, 1, 25, 1, 0.912
3648, 57, 1, 0, 0.973
3648, 57, 1, 1, 0.974
3648, 1, 57, 0, 0.888
3648, 1, 57, 1, 0.888
3648, 2073, 1, 0, 1.024
3648, 2073, 1, 1, 1.024
3648, 2049, 25, 0, 0.907
3648, 2049, 25, 1, 0.907
3712, 0, 0, 0, 0.996
3712, 0, 0, 1, 0.996
3712, 26, 0, 0, 0.96
3712, 26, 0, 1, 0.96
3712, 58, 0, 0, 0.995
3712, 58, 0, 1, 0.995
3712, 0, 26, 0, 0.919
3712, 0, 26, 1, 0.918
3712, 0, 58, 0, 0.93
3712, 0, 58, 1, 0.93
3712, 26, 26, 0, 1.103
3712, 26, 26, 1, 1.102
3712, 58, 58, 0, 0.989
3712, 58, 58, 1, 0.989
3712, 2048, 0, 0, 0.997
3712, 2048, 0, 1, 0.997
3712, 2074, 0, 0, 0.959
3712, 2074, 0, 1, 0.959
3712, 2048, 26, 0, 0.901
3712, 2048, 26, 1, 0.901
3712, 2074, 26, 0, 1.104
3712, 2074, 26, 1, 1.102
3712, 26, 1, 0, 1.001
3712, 26, 1, 1, 1.001
3712, 1, 26, 0, 0.922
3712, 1, 26, 1, 0.922
3712, 58, 1, 0, 0.974
3712, 58, 1, 1, 0.974
3712, 1, 58, 0, 0.903
3712, 1, 58, 1, 0.903
3712, 2074, 1, 0, 1.001
3712, 2074, 1, 1, 1.001
3712, 2049, 26, 0, 0.919
3712, 2049, 26, 1, 0.919
3776, 0, 0, 0, 1.003
3776, 0, 0, 1, 1.003
3776, 27, 0, 0, 0.964
3776, 27, 0, 1, 0.964
3776, 59, 0, 0, 1.004
3776, 59, 0, 1, 1.004
3776, 0, 27, 0, 0.931
3776, 0, 27, 1, 0.931
3776, 0, 59, 0, 0.929
3776, 0, 59, 1, 0.93
3776, 27, 27, 0, 1.097
3776, 27, 27, 1, 1.097
3776, 59, 59, 0, 0.992
3776, 59, 59, 1, 0.992
3776, 2048, 0, 0, 1.003
3776, 2048, 0, 1, 1.003
3776, 2075, 0, 0, 0.963
3776, 2075, 0, 1, 0.964
3776, 2048, 27, 0, 0.898
3776, 2048, 27, 1, 0.898
3776, 2075, 27, 0, 1.097
3776, 2075, 27, 1, 1.097
3776, 27, 1, 0, 0.998
3776, 27, 1, 1, 0.998
3776, 1, 27, 0, 0.925
3776, 1, 27, 1, 0.925
3776, 59, 1, 0, 0.979
3776, 59, 1, 1, 0.979
3776, 1, 59, 0, 0.894
3776, 1, 59, 1, 0.894
3776, 2075, 1, 0, 0.998
3776, 2075, 1, 1, 0.999
3776, 2049, 27, 0, 0.923
3776, 2049, 27, 1, 0.923
3840, 0, 0, 0, 0.997
3840, 0, 0, 1, 0.997
3840, 28, 0, 0, 0.968
3840, 28, 0, 1, 0.968
3840, 60, 0, 0, 1.001
3840, 60, 0, 1, 1.001
3840, 0, 28, 0, 0.926
3840, 0, 28, 1, 0.927
3840, 0, 60, 0, 0.927
3840, 0, 60, 1, 0.927
3840, 28, 28, 0, 1.094
3840, 28, 28, 1, 1.094
3840, 60, 60, 0, 0.982
3840, 60, 60, 1, 0.982
3840, 2048, 0, 0, 0.998
3840, 2048, 0, 1, 0.998
3840, 2076, 0, 0, 0.968
3840, 2076, 0, 1, 0.968
3840, 2048, 28, 0, 0.896
3840, 2048, 28, 1, 0.896
3840, 2076, 28, 0, 1.094
3840, 2076, 28, 1, 1.094
3840, 28, 1, 0, 0.983
3840, 28, 1, 1, 0.982
3840, 1, 28, 0, 0.916
3840, 1, 28, 1, 0.916
3840, 60, 1, 0, 0.969
3840, 60, 1, 1, 0.969
3840, 1, 60, 0, 0.891
3840, 1, 60, 1, 0.891
3840, 2076, 1, 0, 0.983
3840, 2076, 1, 1, 0.983
3840, 2049, 28, 0, 0.912
3840, 2049, 28, 1, 0.912
3904, 0, 0, 0, 1.002
3904, 0, 0, 1, 1.0
3904, 29, 0, 0, 0.961
3904, 29, 0, 1, 0.961
3904, 61, 0, 0, 0.997
3904, 61, 0, 1, 0.997
3904, 0, 29, 0, 0.915
3904, 0, 29, 1, 0.922
3904, 0, 61, 0, 0.933
3904, 0, 61, 1, 0.933
3904, 29, 29, 0, 1.103
3904, 29, 29, 1, 1.103
3904, 61, 61, 0, 0.995
3904, 61, 61, 1, 0.995
3904, 2048, 0, 0, 0.998
3904, 2048, 0, 1, 1.0
3904, 2077, 0, 0, 0.961
3904, 2077, 0, 1, 0.961
3904, 2048, 29, 0, 0.904
3904, 2048, 29, 1, 0.904
3904, 2077, 29, 0, 1.103
3904, 2077, 29, 1, 1.103
3904, 29, 1, 0, 1.0
3904, 29, 1, 1, 1.0
3904, 1, 29, 0, 0.922
3904, 1, 29, 1, 0.922
3904, 61, 1, 0, 0.98
3904, 61, 1, 1, 0.98
3904, 1, 61, 0, 0.904
3904, 1, 61, 1, 0.904
3904, 2077, 1, 0, 1.0
3904, 2077, 1, 1, 1.0
3904, 2049, 29, 0, 0.919
3904, 2049, 29, 1, 0.919
3968, 0, 0, 0, 1.003
3968, 0, 0, 1, 1.003
3968, 30, 0, 0, 0.969
3968, 30, 0, 1, 0.969
3968, 62, 0, 0, 1.006
3968, 62, 0, 1, 1.006
3968, 0, 30, 0, 0.931
3968, 0, 30, 1, 0.93
3968, 0, 62, 0, 0.929
3968, 0, 62, 1, 0.929
3968, 30, 30, 0, 1.103
3968, 30, 30, 1, 1.103
3968, 62, 62, 0, 0.99
3968, 62, 62, 1, 0.99
3968, 2048, 0, 0, 1.004
3968, 2048, 0, 1, 1.004
3968, 2078, 0, 0, 0.969
3968, 2078, 0, 1, 0.969
3968, 2048, 30, 0, 0.899
3968, 2048, 30, 1, 0.899
3968, 2078, 30, 0, 1.105
3968, 2078, 30, 1, 1.105
3968, 30, 1, 0, 0.993
3968, 30, 1, 1, 0.993
3968, 1, 30, 0, 0.908
3968, 1, 30, 1, 0.908
3968, 62, 1, 0, 0.978
3968, 62, 1, 1, 0.978
3968, 1, 62, 0, 0.895
3968, 1, 62, 1, 0.895
3968, 2078, 1, 0, 0.993
3968, 2078, 1, 1, 0.993
3968, 2049, 30, 0, 0.904
3968, 2049, 30, 1, 0.904
4032, 0, 0, 0, 0.995
4032, 0, 0, 1, 0.995
4032, 31, 0, 0, 0.967
4032, 31, 0, 1, 0.967
4032, 63, 0, 0, 1.002
4032, 63, 0, 1, 1.002
4032, 0, 31, 0, 0.927
4032, 0, 31, 1, 0.926
4032, 0, 63, 0, 0.927
4032, 0, 63, 1, 0.927
4032, 31, 31, 0, 1.09
4032, 31, 31, 1, 1.09
4032, 63, 63, 0, 0.987
4032, 63, 63, 1, 0.987
4032, 2048, 0, 0, 0.995
4032, 2048, 0, 1, 0.995
4032, 2079, 0, 0, 0.967
4032, 2079, 0, 1, 0.967
4032, 2048, 31, 0, 0.897
4032, 2048, 31, 1, 0.897
4032, 2079, 31, 0, 1.09
4032, 2079, 31, 1, 1.09
4032, 31, 1, 0, 0.989
4032, 31, 1, 1, 0.989
4032, 1, 31, 0, 0.911
4032, 1, 31, 1, 0.911
4032, 63, 1, 0, 0.971
4032, 63, 1, 1, 0.972
4032, 1, 63, 0, 0.892
4032, 1, 63, 1, 0.892
4032, 2079, 1, 0, 0.989
4032, 2079, 1, 1, 0.989
4032, 2049, 31, 0, 0.907
4032, 2049, 31, 1, 0.907
4096, 32, 0, 0, 1.014
4096, 32, 0, 1, 1.014
4096, 64, 0, 0, 1.014
4096, 64, 0, 1, 1.014
4096, 0, 32, 0, 1.012
4096, 0, 32, 1, 1.012
4096, 0, 64, 0, 1.012
4096, 0, 64, 1, 1.012
4096, 32, 32, 0, 1.014
4096, 32, 32, 1, 1.014
4096, 64, 64, 0, 1.014
4096, 64, 64, 1, 1.014
4096, 2080, 0, 0, 1.014
4096, 2080, 0, 1, 1.014
4096, 2048, 32, 0, 1.014
4096, 2048, 32, 1, 1.014
4096, 2080, 32, 0, 1.014
4096, 2080, 32, 1, 1.014
4096, 32, 1, 0, 0.975
4096, 32, 1, 1, 0.975
4096, 1, 32, 0, 0.769
4096, 1, 32, 1, 0.769
4096, 64, 1, 0, 0.858
4096, 64, 1, 1, 0.858
4096, 1, 64, 0, 0.769
4096, 1, 64, 1, 0.769
4096, 2080, 1, 0, 0.829
4096, 2080, 1, 1, 0.829
4096, 2049, 32, 0, 0.886
4096, 2049, 32, 1, 0.886
4160, 0, 0, 0, 1.003
4160, 0, 0, 1, 1.003
4160, 33, 0, 0, 1.004
4160, 33, 0, 1, 1.004
4160, 65, 0, 0, 0.999
4160, 65, 0, 1, 0.999
4160, 0, 33, 0, 0.931
4160, 0, 33, 1, 0.931
4160, 0, 65, 0, 0.765
4160, 0, 65, 1, 0.765
4160, 33, 33, 0, 0.998
4160, 33, 33, 1, 0.998
4160, 65, 65, 0, 0.942
4160, 65, 65, 1, 0.942
4160, 2048, 0, 0, 1.003
4160, 2048, 0, 1, 1.003
4160, 2081, 0, 0, 1.004
4160, 2081, 0, 1, 1.004
4160, 2048, 33, 0, 0.899
4160, 2048, 33, 1, 0.898
4160, 2081, 33, 0, 1.002
4160, 2081, 33, 1, 1.002
4160, 33, 1, 0, 1.114
4160, 33, 1, 1, 1.114
4160, 1, 33, 0, 1.01
4160, 1, 33, 1, 1.01
4160, 65, 1, 0, 1.077
4160, 65, 1, 1, 1.077
4160, 1, 65, 0, 0.935
4160, 1, 65, 1, 0.935
4160, 2081, 1, 0, 1.077
4160, 2081, 1, 1, 1.077
4160, 2049, 33, 0, 1.007
4160, 2049, 33, 1, 1.007
4224, 0, 0, 0, 1.014
4224, 0, 0, 1, 1.014
4224, 34, 0, 0, 1.0
4224, 34, 0, 1, 1.0
4224, 66, 0, 0, 1.001
4224, 66, 0, 1, 1.001
4224, 0, 34, 0, 0.928
4224, 0, 34, 1, 0.928
4224, 0, 66, 0, 0.762
4224, 0, 66, 1, 0.762
4224, 34, 34, 0, 0.998
4224, 34, 34, 1, 0.998
4224, 66, 66, 0, 0.959
4224, 66, 66, 1, 0.959
4224, 2048, 0, 0, 1.014
4224, 2048, 0, 1, 1.014
4224, 2082, 0, 0, 1.001
4224, 2082, 0, 1, 1.001
4224, 2048, 34, 0, 0.899
4224, 2048, 34, 1, 0.898
4224, 2082, 34, 0, 0.998
4224, 2082, 34, 1, 0.998
4224, 34, 1, 0, 1.024
4224, 34, 1, 1, 1.023
4224, 1, 34, 0, 0.917
4224, 1, 34, 1, 0.917
4224, 66, 1, 0, 1.012
4224, 66, 1, 1, 1.013
4224, 1, 66, 0, 0.917
4224, 1, 66, 1, 0.917
4224, 2082, 1, 0, 1.022
4224, 2082, 1, 1, 1.022
4224, 2049, 34, 0, 0.914
4224, 2049, 34, 1, 0.914
4288, 0, 0, 0, 0.999
4288, 0, 0, 1, 0.999
4288, 35, 0, 0, 0.995
4288, 35, 0, 1, 0.996
4288, 67, 0, 0, 0.998
4288, 67, 0, 1, 0.998
4288, 0, 35, 0, 0.919
4288, 0, 35, 1, 0.918
4288, 0, 67, 0, 0.767
4288, 0, 67, 1, 0.767
4288, 35, 35, 0, 1.005
4288, 35, 35, 1, 1.004
4288, 67, 67, 0, 0.995
4288, 67, 67, 1, 0.995
4288, 2048, 0, 0, 0.999
4288, 2048, 0, 1, 0.999
4288, 2083, 0, 0, 0.995
4288, 2083, 0, 1, 0.995
4288, 2048, 35, 0, 0.905
4288, 2048, 35, 1, 0.904
4288, 2083, 35, 0, 1.005
4288, 2083, 35, 1, 1.004
4288, 35, 1, 0, 1.033
4288, 35, 1, 1, 1.032
4288, 1, 35, 0, 0.928
4288, 1, 35, 1, 0.928
4288, 67, 1, 0, 1.019
4288, 67, 1, 1, 1.02
4288, 1, 67, 0, 0.925
4288, 1, 67, 1, 0.924
4288, 2083, 1, 0, 1.03
4288, 2083, 1, 1, 1.03
4288, 2049, 35, 0, 0.925
4288, 2049, 35, 1, 0.926
4352, 0, 0, 0, 1.005
4352, 0, 0, 1, 1.005
4352, 36, 0, 0, 1.007
4352, 36, 0, 1, 1.006
4352, 68, 0, 0, 1.007
4352, 68, 0, 1, 1.008
4352, 0, 36, 0, 0.929
4352, 0, 36, 1, 0.929
4352, 0, 68, 0, 0.766
4352, 0, 68, 1, 0.766
4352, 36, 36, 0, 0.998
4352, 36, 36, 1, 0.998
4352, 68, 68, 0, 0.964
4352, 68, 68, 1, 0.964
4352, 2048, 0, 0, 1.006
4352, 2048, 0, 1, 1.006
4352, 2084, 0, 0, 1.006
4352, 2084, 0, 1, 1.006
4352, 2048, 36, 0, 0.897
4352, 2048, 36, 1, 0.898
4352, 2084, 36, 0, 0.998
4352, 2084, 36, 1, 0.998
4352, 36, 1, 0, 1.031
4352, 36, 1, 1, 1.031
4352, 1, 36, 0, 0.924
4352, 1, 36, 1, 0.924
4352, 68, 1, 0, 0.999
4352, 68, 1, 1, 0.999
4352, 1, 68, 0, 0.922
4352, 1, 68, 1, 0.922
4352, 2084, 1, 0, 1.03
4352, 2084, 1, 1, 1.03
4352, 2049, 36, 0, 0.922
4352, 2049, 36, 1, 0.922
4416, 0, 0, 0, 0.997
4416, 0, 0, 1, 0.997
4416, 37, 0, 0, 1.002
4416, 37, 0, 1, 1.002
4416, 69, 0, 0, 1.004
4416, 69, 0, 1, 1.004
4416, 0, 37, 0, 0.928
4416, 0, 37, 1, 0.927
4416, 0, 69, 0, 0.762
4416, 0, 69, 1, 0.762
4416, 37, 37, 0, 0.994
4416, 37, 37, 1, 0.994
4416, 69, 69, 0, 0.959
4416, 69, 69, 1, 0.959
4416, 2048, 0, 0, 0.997
4416, 2048, 0, 1, 0.997
4416, 2085, 0, 0, 1.001
4416, 2085, 0, 1, 1.001
4416, 2048, 37, 0, 0.899
4416, 2048, 37, 1, 0.899
4416, 2085, 37, 0, 0.994
4416, 2085, 37, 1, 0.994
4416, 37, 1, 0, 1.024
4416, 37, 1, 1, 1.023
4416, 1, 37, 0, 0.923
4416, 1, 37, 1, 0.922
4416, 69, 1, 0, 1.009
4416, 69, 1, 1, 1.01
4416, 1, 69, 0, 0.917
4416, 1, 69, 1, 0.917
4416, 2085, 1, 0, 1.024
4416, 2085, 1, 1, 1.024
4416, 2049, 37, 0, 0.919
4416, 2049, 37, 1, 0.919
4480, 0, 0, 0, 1.0
4480, 0, 0, 1, 0.999
4480, 38, 0, 0, 0.996
4480, 38, 0, 1, 0.996
4480, 70, 0, 0, 1.0
4480, 70, 0, 1, 1.0
4480, 0, 38, 0, 0.919
4480, 0, 38, 1, 0.921
4480, 0, 70, 0, 0.767
4480, 0, 70, 1, 0.767
4480, 38, 38, 0, 1.002
4480, 38, 38, 1, 1.002
4480, 70, 70, 0, 0.963
4480, 70, 70, 1, 0.963
4480, 2048, 0, 0, 0.998
4480, 2048, 0, 1, 0.999
4480, 2086, 0, 0, 0.996
4480, 2086, 0, 1, 0.995
4480, 2048, 38, 0, 0.907
4480, 2048, 38, 1, 0.907
4480, 2086, 38, 0, 1.002
4480, 2086, 38, 1, 1.002
4480, 38, 1, 0, 1.032
4480, 38, 1, 1, 1.031
4480, 1, 38, 0, 0.919
4480, 1, 38, 1, 0.92
4480, 70, 1, 0, 1.018
4480, 70, 1, 1, 1.017
4480, 1, 70, 0, 0.916
4480, 1, 70, 1, 0.915
4480, 2086, 1, 0, 1.031
4480, 2086, 1, 1, 1.03
4480, 2049, 38, 0, 0.917
4480, 2049, 38, 1, 0.918
4544, 0, 0, 0, 1.002
4544, 0, 0, 1, 1.002
4544, 39, 0, 0, 1.007
4544, 39, 0, 1, 1.008
4544, 71, 0, 0, 1.002
4544, 71, 0, 1, 1.002
4544, 0, 39, 0, 0.93
4544, 0, 39, 1, 0.931
4544, 0, 71, 0, 0.766
4544, 0, 71, 1, 0.766
4544, 39, 39, 0, 1.001
4544, 39, 39, 1, 1.001
4544, 71, 71, 0, 0.966
4544, 71, 71, 1, 0.966
4544, 2048, 0, 0, 1.002
4544, 2048, 0, 1, 1.002
4544, 2087, 0, 0, 1.008
4544, 2087, 0, 1, 1.007
4544, 2048, 39, 0, 0.901
4544, 2048, 39, 1, 0.901
4544, 2087, 39, 0, 1.001
4544, 2087, 39, 1, 1.001
4544, 39, 1, 0, 1.025
4544, 39, 1, 1, 1.025
4544, 1, 39, 0, 0.919
4544, 1, 39, 1, 0.919
4544, 71, 1, 0, 0.991
4544, 71, 1, 1, 0.991
4544, 1, 71, 0, 0.921
4544, 1, 71, 1, 0.922
4544, 2087, 1, 0, 1.025
4544, 2087, 1, 1, 1.025
4544, 2049, 39, 0, 0.917
4544, 2049, 39, 1, 0.917
4608, 0, 0, 0, 0.997
4608, 0, 0, 1, 0.997
4608, 40, 0, 0, 1.013
4608, 40, 0, 1, 1.013
4608, 72, 0, 0, 1.013
4608, 72, 0, 1, 1.013
4608, 0, 40, 0, 0.925
4608, 0, 40, 1, 0.926
4608, 0, 72, 0, 0.765
4608, 0, 72, 1, 0.765
4608, 40, 40, 0, 1.084
4608, 40, 40, 1, 1.084
4608, 72, 72, 0, 0.966
4608, 72, 72, 1, 0.966
4608, 2048, 0, 0, 0.999
4608, 2048, 0, 1, 0.999
4608, 2088, 0, 0, 1.012
4608, 2088, 0, 1, 1.012
4608, 2048, 40, 0, 0.898
4608, 2048, 40, 1, 0.898
4608, 2088, 40, 0, 1.087
4608, 2088, 40, 1, 1.087
4608, 40, 1, 0, 1.006
4608, 40, 1, 1, 1.006
4608, 1, 40, 0, 0.926
4608, 1, 40, 1, 0.925
4608, 72, 1, 0, 1.012
4608, 72, 1, 1, 1.011
4608, 1, 72, 0, 0.92
4608, 1, 72, 1, 0.92
4608, 2088, 1, 0, 1.006
4608, 2088, 1, 1, 1.006
4608, 2049, 40, 0, 0.923
4608, 2049, 40, 1, 0.923
4672, 0, 0, 0, 1.014
4672, 0, 0, 1, 1.014
4672, 41, 0, 0, 1.003
4672, 41, 0, 1, 1.003
4672, 73, 0, 0, 0.983
4672, 73, 0, 1, 0.982
4672, 0, 41, 0, 0.916
4672, 0, 41, 1, 0.918
4672, 0, 73, 0, 0.772
4672, 0, 73, 1, 0.772
4672, 41, 41, 0, 1.012
4672, 41, 41, 1, 1.012
4672, 73, 73, 0, 0.973
4672, 73, 73, 1, 0.973
4672, 2048, 0, 0, 1.014
4672, 2048, 0, 1, 1.014
4672, 2089, 0, 0, 1.002
4672, 2089, 0, 1, 1.002
4672, 2048, 41, 0, 0.907
4672, 2048, 41, 1, 0.908
4672, 2089, 41, 0, 1.012
4672, 2089, 41, 1, 1.012
4672, 41, 1, 0, 1.027
4672, 41, 1, 1, 1.027
4672, 1, 41, 0, 0.928
4672, 1, 41, 1, 0.927
4672, 73, 1, 0, 1.032
4672, 73, 1, 1, 1.03
4672, 1, 73, 0, 0.927
4672, 1, 73, 1, 0.927
4672, 2089, 1, 0, 1.026
4672, 2089, 1, 1, 1.027
4672, 2049, 41, 0, 0.925
4672, 2049, 41, 1, 0.925
4736, 0, 0, 0, 1.005
4736, 0, 0, 1, 1.005
4736, 42, 0, 0, 1.012
4736, 42, 0, 1, 1.012
4736, 74, 0, 0, 0.976
4736, 74, 0, 1, 0.975
4736, 0, 42, 0, 0.93
4736, 0, 42, 1, 0.93
4736, 0, 74, 0, 0.77
4736, 0, 74, 1, 0.77
4736, 42, 42, 0, 1.007
4736, 42, 42, 1, 1.007
4736, 74, 74, 0, 0.965
4736, 74, 74, 1, 0.965
4736, 2048, 0, 0, 1.006
4736, 2048, 0, 1, 1.006
4736, 2090, 0, 0, 1.013
4736, 2090, 0, 1, 1.013
4736, 2048, 42, 0, 0.902
4736, 2048, 42, 1, 0.902
4736, 2090, 42, 0, 1.007
4736, 2090, 42, 1, 1.007
4736, 42, 1, 0, 1.032
4736, 42, 1, 1, 1.032
4736, 1, 42, 0, 0.925
4736, 1, 42, 1, 0.925
4736, 74, 1, 0, 1.018
4736, 74, 1, 1, 1.018
4736, 1, 74, 0, 0.912
4736, 1, 74, 1, 0.912
4736, 2090, 1, 0, 1.032
4736, 2090, 1, 1, 1.032
4736, 2049, 42, 0, 0.923
4736, 2049, 42, 1, 0.923
4800, 0, 0, 0, 1.012
4800, 0, 0, 1, 1.012
4800, 43, 0, 0, 1.008
4800, 43, 0, 1, 1.008
4800, 75, 0, 0, 0.99
4800, 75, 0, 1, 0.99
4800, 0, 43, 0, 0.928
4800, 0, 43, 1, 0.928
4800, 0, 75, 0, 0.767
4800, 0, 75, 1, 0.768
4800, 43, 43, 0, 1.004
4800, 43, 43, 1, 1.004
4800, 75, 75, 0, 0.965
4800, 75, 75, 1, 0.965
4800, 2048, 0, 0, 1.012
4800, 2048, 0, 1, 1.012
4800, 2091, 0, 0, 1.009
4800, 2091, 0, 1, 1.008
4800, 2048, 43, 0, 0.902
4800, 2048, 43, 1, 0.902
4800, 2091, 43, 0, 1.004
4800, 2091, 43, 1, 1.004
4800, 43, 1, 0, 1.026
4800, 43, 1, 1, 1.025
4800, 1, 43, 0, 0.91
4800, 1, 43, 1, 0.91
4800, 75, 1, 0, 0.992
4800, 75, 1, 1, 0.992
4800, 1, 75, 0, 0.921
4800, 1, 75, 1, 0.92
4800, 2091, 1, 0, 1.025
4800, 2091, 1, 1, 1.025
4800, 2049, 43, 0, 0.907
4800, 2049, 43, 1, 0.907
4864, 0, 0, 0, 0.998
4864, 0, 0, 1, 0.998
4864, 44, 0, 0, 1.003
4864, 44, 0, 1, 1.004
4864, 76, 0, 0, 0.987
4864, 76, 0, 1, 0.987
4864, 0, 44, 0, 0.92
4864, 0, 44, 1, 0.921
4864, 0, 76, 0, 0.933
4864, 0, 76, 1, 0.932
4864, 44, 44, 0, 1.006
4864, 44, 44, 1, 1.004
4864, 76, 76, 0, 0.976
4864, 76, 76, 1, 0.975
4864, 2048, 0, 0, 0.999
4864, 2048, 0, 1, 0.999
4864, 2092, 0, 0, 1.004
4864, 2092, 0, 1, 1.005
4864, 2048, 44, 0, 0.907
4864, 2048, 44, 1, 0.907
4864, 2092, 44, 0, 1.006
4864, 2092, 44, 1, 1.005
4864, 44, 1, 0, 1.034
4864, 44, 1, 1, 1.032
4864, 1, 44, 0, 0.908
4864, 1, 44, 1, 0.929
4864, 76, 1, 0, 1.006
4864, 76, 1, 1, 1.005
4864, 1, 76, 0, 0.798
4864, 1, 76, 1, 0.798
4864, 2092, 1, 0, 1.033
4864, 2092, 1, 1, 1.033
4864, 2049, 44, 0, 0.904
4864, 2049, 44, 1, 0.925
4928, 0, 0, 0, 1.005
4928, 0, 0, 1, 1.005
4928, 45, 0, 0, 0.993
4928, 45, 0, 1, 1.012
4928, 77, 0, 0, 0.956
4928, 77, 0, 1, 0.976
4928, 0, 45, 0, 0.933
4928, 0, 45, 1, 0.932
4928, 0, 77, 0, 0.771
4928, 0, 77, 1, 0.771
4928, 45, 45, 0, 1.015
4928, 45, 45, 1, 1.015
4928, 77, 77, 0, 0.972
4928, 77, 77, 1, 0.972
4928, 2048, 0, 0, 1.005
4928, 2048, 0, 1, 1.005
4928, 2093, 0, 0, 0.992
4928, 2093, 0, 1, 1.012
4928, 2048, 45, 0, 0.932
4928, 2048, 45, 1, 0.931
4928, 2093, 45, 0, 1.015
4928, 2093, 45, 1, 1.015
4928, 45, 1, 0, 1.009
4928, 45, 1, 1, 1.032
4928, 1, 45, 0, 0.806
4928, 1, 45, 1, 0.805
4928, 77, 1, 0, 0.981
4928, 77, 1, 1, 1.005
4928, 1, 77, 0, 0.917
4928, 1, 77, 1, 0.917
4928, 2093, 1, 0, 1.008
4928, 2093, 1, 1, 1.032
4928, 2049, 45, 0, 0.794
4928, 2049, 45, 1, 0.794
4992, 0, 0, 0, 0.999
4992, 0, 0, 1, 0.999
4992, 46, 0, 0, 0.985
4992, 46, 0, 1, 1.008
4992, 78, 0, 0, 0.963
4992, 78, 0, 1, 0.984
4992, 0, 46, 0, 0.908
4992, 0, 46, 1, 0.908
4992, 0, 78, 0, 0.752
4992, 0, 78, 1, 0.751
4992, 46, 46, 0, 0.997
4992, 46, 46, 1, 0.997
4992, 78, 78, 0, 0.969
4992, 78, 78, 1, 0.968
4992, 2048, 0, 0, 1.0
4992, 2048, 0, 1, 1.0
4992, 2094, 0, 0, 0.987
4992, 2094, 0, 1, 1.008
4992, 2048, 46, 0, 0.883
4992, 2048, 46, 1, 0.883
4992, 2094, 46, 0, 0.997
4992, 2094, 46, 1, 0.997
4992, 46, 1, 0, 0.998
4992, 46, 1, 1, 1.02
4992, 1, 46, 0, 0.917
4992, 1, 46, 1, 0.917
4992, 78, 1, 0, 0.972
4992, 78, 1, 1, 0.993
4992, 1, 78, 0, 0.919
4992, 1, 78, 1, 0.92
4992, 2094, 1, 0, 0.997
4992, 2094, 1, 1, 1.019
4992, 2049, 46, 0, 0.914
4992, 2049, 46, 1, 0.914
5056, 0, 0, 0, 1.002
5056, 0, 0, 1, 1.0
5056, 47, 0, 0, 1.005
5056, 47, 0, 1, 1.005
5056, 79, 0, 0, 0.989
5056, 79, 0, 1, 0.989
5056, 0, 47, 0, 0.918
5056, 0, 47, 1, 0.919
5056, 0, 79, 0, 0.772
5056, 0, 79, 1, 0.771
5056, 47, 47, 0, 1.006
5056, 47, 47, 1, 1.006
5056, 79, 79, 0, 0.972
5056, 79, 79, 1, 0.972
5056, 2048, 0, 0, 1.001
5056, 2048, 0, 1, 1.0
5056, 2095, 0, 0, 1.004
5056, 2095, 0, 1, 1.004
5056, 2048, 47, 0, 0.908
5056, 2048, 47, 1, 0.909
5056, 2095, 47, 0, 1.006
5056, 2095, 47, 1, 1.006
5056, 47, 1, 0, 1.033
5056, 47, 1, 1, 1.033
5056, 1, 47, 0, 0.919
5056, 1, 47, 1, 0.919
5056, 79, 1, 0, 1.003
5056, 79, 1, 1, 1.005
5056, 1, 79, 0, 0.921
5056, 1, 79, 1, 0.921
5056, 2095, 1, 0, 1.032
5056, 2095, 1, 1, 1.034
5056, 2049, 47, 0, 0.918
5056, 2049, 47, 1, 0.917
5120, 0, 0, 0, 1.003
5120, 0, 0, 1, 1.003
5120, 48, 0, 0, 1.068
5120, 48, 0, 1, 1.068
5120, 80, 0, 0, 1.068
5120, 80, 0, 1, 1.068
5120, 0, 48, 0, 1.065
5120, 0, 48, 1, 1.065
5120, 0, 80, 0, 1.064
5120, 0, 80, 1, 1.065
5120, 48, 48, 0, 1.004
5120, 48, 48, 1, 1.004
5120, 80, 80, 0, 1.005
5120, 80, 80, 1, 1.005
5120, 2048, 0, 0, 1.005
5120, 2048, 0, 1, 1.005
5120, 2096, 0, 0, 1.068
5120, 2096, 0, 1, 1.068
5120, 2048, 48, 0, 1.065
5120, 2048, 48, 1, 1.065
5120, 2096, 48, 0, 1.005
5120, 2096, 48, 1, 1.005
5120, 48, 1, 0, 1.033
5120, 48, 1, 1, 1.031
5120, 1, 48, 0, 0.898
5120, 1, 48, 1, 0.898
5120, 80, 1, 0, 0.844
5120, 80, 1, 1, 0.844
5120, 1, 80, 0, 0.898
5120, 1, 80, 1, 0.898
5120, 2096, 1, 0, 0.856
5120, 2096, 1, 1, 0.855
5120, 2049, 48, 0, 0.898
5120, 2049, 48, 1, 0.898
bench-memcpy-random:
length, New Time / Old Time
32768, 0.866
65536, 0.891
131072, 0.896
262144, 0.901
524288, 0.904
1048576, 0.913
bench-memcpy-large:
length, align0, align1, dst>src, New Time/Old Time
65543, 0, 0, 0, 0.981
65543, 0, 0, 1, 0.981
65551, 0, 3, 0, 1.012
65551, 0, 3, 1, 1.013
65567, 3, 0, 0, 1.019
65567, 3, 0, 1, 1.02
65599, 3, 5, 0, 1.058
65599, 3, 5, 1, 1.061
65536, 0, 127, 0, 1.046
65536, 0, 127, 1, 1.046
65536, 0, 255, 0, 1.071
65536, 0, 255, 1, 1.071
65536, 0, 256, 0, 0.983
65536, 0, 256, 1, 0.984
65536, 0, 4064, 0, 1.017
65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.981
131079, 0, 0, 1, 0.981
131087, 0, 3, 0, 1.017
131087, 0, 3, 1, 1.017
131103, 3, 0, 0, 1.022
131103, 3, 0, 1, 1.022
131135, 3, 5, 0, 1.064
131135, 3, 5, 1, 1.065
131072, 0, 127, 0, 1.05
131072, 0, 127, 1, 1.05
131072, 0, 255, 0, 1.074
131072, 0, 255, 1, 1.074
131072, 0, 256, 0, 0.984
131072, 0, 256, 1, 0.984
131072, 0, 4064, 0, 1.018
131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.985
262151, 0, 0, 1, 0.985
262159, 0, 3, 0, 1.026
262159, 0, 3, 1, 1.026
262175, 3, 0, 0, 1.03
262175, 3, 0, 1, 1.03
262207, 3, 5, 0, 1.07
262207, 3, 5, 1, 1.07
262144, 0, 127, 0, 1.057
262144, 0, 127, 1, 1.057
262144, 0, 255, 0, 1.079
262144, 0, 255, 1, 1.078
262144, 0, 256, 0, 0.988
262144, 0, 256, 1, 0.988
262144, 0, 4064, 0, 1.02
262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692
524295, 0, 0, 1, 0.692
524303, 0, 3, 0, 0.736
524303, 0, 3, 1, 0.737
524319, 3, 0, 0, 0.758
524319, 3, 0, 1, 0.759
524351, 3, 5, 0, 0.759
524351, 3, 5, 1, 0.759
524288, 0, 127, 0, 1.057
524288, 0, 127, 1, 1.058
524288, 0, 255, 0, 1.079
524288, 0, 255, 1, 1.079
524288, 0, 256, 0, 0.988
524288, 0, 256, 1, 0.988
524288, 0, 4064, 0, 1.02
524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948
1048583, 0, 0, 1, 0.948
1048591, 0, 3, 0, 0.735
1048591, 0, 3, 1, 0.735
1048607, 3, 0, 0, 0.757
1048607, 3, 0, 1, 0.758
1048639, 3, 5, 0, 0.758
1048639, 3, 5, 1, 0.758
1048576, 0, 127, 0, 0.761
1048576, 0, 127, 1, 0.762
1048576, 0, 255, 0, 0.751
1048576, 0, 255, 1, 0.751
1048576, 0, 256, 0, 0.93
1048576, 0, 256, 1, 0.93
1048576, 0, 4064, 0, 0.93
1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928
2097159, 0, 0, 1, 0.931
2097167, 0, 3, 0, 0.735
2097167, 0, 3, 1, 0.734
2097183, 3, 0, 0, 0.759
2097183, 3, 0, 1, 0.759
2097215, 3, 5, 0, 0.758
2097215, 3, 5, 1, 0.757
2097152, 0, 127, 0, 0.77
2097152, 0, 127, 1, 0.77
2097152, 0, 255, 0, 0.745
2097152, 0, 255, 1, 0.745
2097152, 0, 256, 0, 0.924
2097152, 0, 256, 1, 0.925
2097152, 0, 4064, 0, 0.926
2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.894
4194311, 0, 0, 1, 0.896
4194319, 0, 3, 0, 0.752
4194319, 0, 3, 1, 0.751
4194335, 3, 0, 0, 0.82
4194335, 3, 0, 1, 0.821
4194367, 3, 5, 0, 0.788
4194367, 3, 5, 1, 0.789
4194304, 0, 127, 0, 0.801
4194304, 0, 127, 1, 0.801
4194304, 0, 255, 0, 0.802
4194304, 0, 255, 1, 0.804
4194304, 0, 256, 0, 0.873
4194304, 0, 256, 1, 0.868
4194304, 0, 4064, 0, 0.955
4194304, 0, 4064, 1, 0.954
8388615, 0, 0, 0, 0.885
8388615, 0, 0, 1, 0.886
8388623, 0, 3, 0, 0.769
8388623, 0, 3, 1, 0.769
8388639, 3, 0, 0, 0.87
8388639, 3, 0, 1, 0.87
8388671, 3, 5, 0, 0.811
8388671, 3, 5, 1, 0.814
8388608, 0, 127, 0, 0.83
8388608, 0, 127, 1, 0.83
8388608, 0, 255, 0, 0.857
8388608, 0, 255, 1, 0.857
8388608, 0, 256, 0, 0.851
8388608, 0, 256, 1, 0.848
8388608, 0, 4064, 0, 0.981
8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.885
16777223, 0, 0, 1, 0.886
16777231, 0, 3, 0, 0.769
16777231, 0, 3, 1, 0.768
16777247, 3, 0, 0, 0.87
16777247, 3, 0, 1, 0.87
16777279, 3, 5, 0, 0.811
16777279, 3, 5, 1, 0.814
16777216, 0, 127, 0, 0.831
16777216, 0, 127, 1, 0.83
16777216, 0, 255, 0, 0.857
16777216, 0, 255, 1, 0.857
16777216, 0, 256, 0, 0.852
16777216, 0, 256, 1, 0.848
16777216, 0, 4064, 0, 0.98
16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.885
33554439, 0, 0, 1, 0.886
33554447, 0, 3, 0, 0.768
33554447, 0, 3, 1, 0.768
33554463, 3, 0, 0, 0.871
33554463, 3, 0, 1, 0.87
33554495, 3, 5, 0, 0.811
33554495, 3, 5, 1, 0.814
33554432, 0, 127, 0, 0.831
33554432, 0, 127, 1, 0.831
33554432, 0, 255, 0, 0.858
33554432, 0, 255, 1, 0.857
33554432, 0, 256, 0, 0.852
33554432, 0, 256, 1, 0.848
33554432, 0, 4064, 0, 0.98
33554432, 0, 4064, 1, 0.981
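For reference, a minimal self-contained sketch of how one row of the
ratio tables above can be timed.  This is NOT the glibc benchtests
harness: both timings below go through the system memcpy (so the
printed ratio is ~1.0), whereas the numbers above compare the removed
SSSE3 variant against the implementations that remain.

    /* Sketch only -- times one (length, align0, align1) configuration
       twice and prints a row in the same format as the tables above.  */
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <time.h>

    static double
    time_memcpy (char *dst, const char *src, size_t len, long iters)
    {
      struct timespec t0, t1;
      clock_gettime (CLOCK_MONOTONIC, &t0);
      for (long i = 0; i < iters; i++)
        {
          memcpy (dst, src, len);
          __asm__ volatile ("" ::: "memory");  /* keep each call alive */
        }
      clock_gettime (CLOCK_MONOTONIC, &t1);
      return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
    }

    int
    main (void)
    {
      size_t len = 3072, align0 = 16, align1 = 0;  /* one table row */
      char *buf = aligned_alloc (4096, 1 << 22);
      char *dst = buf + align0;             /* dst < src, so dst>src = 0 */
      char *src = buf + (1 << 21) + align1;
      memset (buf, 1, 1 << 22);
      double t_old = time_memcpy (dst, src, len, 1 << 17);
      double t_new = time_memcpy (dst, src, len, 1 << 17);
      printf ("%zu, %zu, %zu, 0, %.3f\n", len, align0, align1,
              t_new / t_old);
      free (buf);
      return 0;
    }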
---
 sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
6 files changed, 3572 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
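Two idioms recur throughout the deleted strcpy-ssse3.S below, and a
short intrinsics rendering (illustration only, not code from this
patch) may help reviewers.  The NUL scan itself is SSE2: pcmpeqb
against a zeroed register, then pmovmskb to "convert byte mask ... to
bit mask" as the comments put it.  The SSSE3-specific instruction is
palignr, which the L(ShlN) loops use to realign a misaligned source so
the 16-byte stores stay aligned.

    /* Illustration of the two idioms; build with gcc -O2 -mssse3.  */
    #include <emmintrin.h>   /* SSE2: pcmpeqb / pmovmskb */
    #include <tmmintrin.h>   /* SSSE3: palignr */
    #include <stdio.h>

    /* pcmpeqb + pmovmskb: offset of the first NUL in a 16-byte block,
       or -1 if the block contains none.  */
    static int
    nul_index_16 (const char *p)
    {
      __m128i chunk = _mm_loadu_si128 ((const __m128i *) p);
      int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk,
                                                    _mm_setzero_si128 ()));
      return mask ? __builtin_ctz (mask) : -1;
    }

    /* palignr $1: bytes lo[1..15] followed by hi[0], i.e. the 16-byte
       window shifted down one byte -- how the L(Shl1) loop below feeds
       aligned stores from a source misaligned by one byte.  */
    static __m128i
    shl1_window (__m128i lo, __m128i hi)
    {
      return _mm_alignr_epi8 (hi, lo, 1);
    }

    int
    main (void)
    {
      char buf[16] = "hello";
      printf ("first NUL at %d\n", nul_index_16 (buf));   /* prints 5 */
      __m128i w = shl1_window (_mm_loadu_si128 ((const __m128i *) buf),
                               _mm_setzero_si128 ());
      printf ("shifted first byte: %c\n", ((char *) &w)[0]);  /* 'e' */
      return 0;
    }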
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2b3c625ea2..5b02ec8de5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -46,13 +46,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -83,7 +81,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 41a04621ad..49ce6860d0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
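The "cmpb $1, (%rax); sbb $-1, %rax" pair that recurs in the USE_AS_STPCPY exits above is a branchless fix-up of stpncpy's return value: cmpb sets the carry flag exactly when the byte at %rax is zero, and sbb then computes rax + 1 - CF, so the pointer advances past the last stored byte only when that byte is not the terminator. Equivalent C (a sketch; the helper name is invented):

/* C analogue of "cmpb $1, (%rax); sbb $-1, %rax": given a pointer to
   the last byte stored, stpncpy must return it when it is the NUL and
   one past it when the copy was cut short by the length limit.  */
static inline char *
stpncpy_return (char *last_byte)
{
  return last_byte + 1 - (*last_byte == '\0');
}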
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
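The L(CopyFrom1To16Bytes) dispatch above locates the terminating NUL by testing the pmovmskb mask one bit at a time and jumping to a size-specific exit stub that uses overlapping fixed-size loads. The same position can be computed directly with a count-trailing-zeros operation, roughly what the newer SSE4.1/AVX2/EVEX variants that supersede this file do. A hedged C sketch (not the patch's code):

#include <emmintrin.h>  /* SSE2 */
#include <string.h>

/* Copy the first 16 source bytes up to and including a NUL known to be
   present, using the pmovmskb bit mask to find its index.  */
static inline void
copy_through_nul16 (char *dst, const char *src)
{
  __m128i v = _mm_loadu_si128 ((const __m128i *) src);
  unsigned mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
  /* __builtin_ctz is undefined for mask == 0; the exit paths above are
     only reached once a zero byte has been seen.  */
  unsigned idx = __builtin_ctz (mask);
  memcpy (dst, src, idx + 1);   /* include the terminator */
}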
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
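The Case2/Case3 branch trees above reconcile strncpy's length limit with the NUL mask: whichever limit is reached first selects the exit stub. In C the surviving copy length is simply a minimum (a sketch; the helper name is invented):

#include <stddef.h>

/* What the L(CopyFrom1To16BytesCase2) tree computes: copy up to the
   NUL (inclusive) or up to the remaining strncpy limit, whichever
   comes first.  nul_mask is the pmovmskb result.  */
static inline size_t
bounded_copy_len (unsigned nul_mask, size_t remaining)
{
  size_t stop = remaining;                     /* Case3: limit hit first */
  if (nul_mask != 0)
    {
      size_t nul_len = (size_t) __builtin_ctz (nul_mask) + 1;
      if (nul_len < stop)                      /* Case2: NUL inside limit */
        stop = nul_len;
    }
  return stop;
}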
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
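L(StrncpyFillTailWithZero1) above implements strncpy's requirement that the rest of the n-byte destination be zero filled: one unaligned 16-byte store for the head, round the pointer to a 16-byte boundary (re-counting the overlap), then pad with aligned stores. A hedged C sketch of the same shape (assumes len >= 16; the 64-byte unrolling of the original is omitted):

#include <emmintrin.h>
#include <stdint.h>
#include <stddef.h>

static void
fill_tail_with_zero (unsigned char *p, size_t len)
{
  const __m128i z = _mm_setzero_si128 ();
  unsigned char *end = p + len;
  _mm_storeu_si128 ((__m128i *) p, z);          /* head, maybe unaligned */
  /* Round up to the next 16-byte boundary; the skipped bytes were
     already zeroed by the unaligned head store.  */
  unsigned char *q =
    (unsigned char *) (((uintptr_t) p + 16) & ~(uintptr_t) 15);
  while (q + 16 <= end)
    {
      _mm_store_si128 ((__m128i *) q, z);
      q += 16;
    }
  if (q < end)                                  /* overlapping tail store */
    _mm_storeu_si128 ((__m128i *) (end - 16), z);
}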
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1
* [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (2 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
` (5 subsequent siblings)
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +-
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
5 files changed, 6 insertions(+), 3212 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5b02ec8de5..303fb5d734 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,6 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memcpy-ssse3 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
@@ -25,7 +24,6 @@ sysdep_routines += \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 49ce6860d0..c6008a73ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
@@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
@@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
@@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
@@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
@@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..fb01fbb301 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ return OPTIMIZE (ssse3);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
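For clarity, the memmove selector tail after this patch reads (assembled from the + lines above; the earlier AVX/EVEX cases are unchanged and omitted):

  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
    return OPTIMIZE (ssse3);

  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    return OPTIMIZE (sse2_unaligned_erms);

  return OPTIMIZE (sse2_unaligned);

With __memmove_ssse3_back gone, SSSE3 survives only where unaligned copies are slow; everything else falls through to the SSE2 unaligned variants, preferring the ERMS flavor when rep movsb is fast.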
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
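The BRANCH_TO_JMPTBL_ENTRY macro being deleted above dispatches through a table of offsets relative to the table's own base (JMPTBL(I, B) = I - B), so the table holds no absolute addresses and needs no relocations in shared objects; _CET_NOTRACK marks the indirect jump and the trailing ud2 traps a stray fall-through. GCC's labels-as-values extension expresses the same pattern in C (a sketch, not glibc code):

/* Position-independent jump table in GNU C: entries are offsets
   relative to a base label, resolved with computed goto.  */
static void
dispatch_tail (unsigned n)   /* n selects the residual-size handler */
{
  static const int table[] = {
    &&tail0 - &&tail0,
    &&tail1 - &&tail0,
    &&tail2 - &&tail0,
  };
  goto *(&&tail0 + table[n]);
 tail0: /* handle 0 trailing bytes */ return;
 tail1: /* handle 1 trailing byte  */ return;
 tail2: /* handle 2 trailing bytes */ return;
}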
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
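When built as memmove, the prologue above picks the copy direction: forward when dst < src, nothing when the pointers are equal, and the backward tables otherwise, which is what makes overlapping moves safe. A plain C sketch of the direction rule (not the jump-table machinery):

#include <stddef.h>

static void *
memmove_sketch (void *dst, const void *src, size_t n)
{
  unsigned char *d = dst;
  const unsigned char *s = src;
  if (d == s || n == 0)
    return dst;
  if (d < s)
    while (n--)
      *d++ = *s++;      /* forward: reads stay ahead of writes */
  else
    while (n--)
      d[n] = s[n];      /* backward: reads stay behind writes */
  return dst;
}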
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
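Before indexing the shift table above, the forward path compares the copy length against __x86_data_cache_size and diverts to L(gobble_mem_fwd) (further down in the deleted file, outside this excerpt) when the copy cannot stay cache resident. The shape of that decision, with hypothetical helper names standing in for the assembly labels:

#include <stddef.h>

extern long __x86_data_cache_size;   /* tunable exported by glibc */

static void gobble_mem_fwd (char *d, const char *s, size_t n) { /* ... */ }
static void shl_path_fwd (char *d, const char *s, size_t n,
                          unsigned skew) { /* ... */ }

/* Mirrors "cmp %rcx, %rdx; jae L(gobble_mem_fwd)": huge copies leave
   the palignr shift paths for a bulk routine; smaller ones index the
   shift table by the source's alignment skew.  */
static void
copy_forward_dispatch (char *d, const char *s, size_t n, unsigned skew)
{
  if (n >= (size_t) __x86_data_cache_size)
    gobble_mem_fwd (d, s, n);
  else
    shl_path_fwd (d, s, n, skew);
}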
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
-	movaps -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
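The forward and backward "gobble" loops pick between cached and
non-temporal stores based on how much of the copy exceeds half the shared
cache size: a small excess (at most 4 KiB) stays on the cached movdqa path,
a moderate one streams the excess with movntdq and finishes the last
cache-half with cached stores (the 2steps_copy case), and anything larger
is streamed entirely. An illustrative reconstruction of that threshold
logic in C (the enum and function names are invented, not glibc
interfaces):

#include <stddef.h>

enum copy_path { CACHED, STREAM_THEN_CACHED, STREAM_ALL };

static enum copy_path
pick_copy_path (size_t len, size_t cache_size_half)
{
  /* At most half the shared cache is copied with ordinary stores.  */
  size_t cached = len < cache_size_half ? len : cache_size_half;
  size_t excess = len - cached;	/* Candidate for movntdq streaming.  */

  if (excess <= 0x1000)		/* Tiny excess: plain cached copy.  */
    return CACHED;
  if (excess <= cached * 8)	/* Stream the excess, cached tail.  */
    return STREAM_THEN_CACHED;
  return STREAM_ALL;		/* Huge copy: bypass the cache throughout.  */
}

Streaming ends with sfence so the weakly-ordered movntdq stores are
globally visible before the cached tail is written.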
-
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
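The fwd_write_* (and, below, bwd_write_*) blocks are the targets of the
144-entry jump tables: the length remaining after the main loop, 0..143, is
used as a direct index, and each entry point falls through the chain under
it so that exactly the right number of overlapping 16-byte chunks gets
copied relative to the end of the buffer. The same dispatch written as a
plain C loop instead of a computed jump (illustrative only, for the
non-overlapping memcpy case):

#include <emmintrin.h>	/* SSE2: unaligned 16-byte load/store.  */
#include <stddef.h>

/* Copy the last N bytes ending at dst_end/src_end, 17 <= n <= 143.  */
static void
copy_tail_fwd (char *dst_end, const char *src_end, size_t n)
{
  while (n > 16)
    {
      __m128i v = _mm_loadu_si128 ((const __m128i *) (src_end - n));
      _mm_storeu_si128 ((__m128i *) (dst_end - n), v);
      n -= 16;
      if (n < 16)	/* Let the last chunk overlap the previous store.  */
	n = 16;
    }
  /* Final chunk, flush with the end of the buffer.  */
  __m128i v = _mm_loadu_si128 ((const __m128i *) (src_end - 16));
  _mm_storeu_si128 ((__m128i *) (dst_end - 16), v);
}

Tails shorter than 16 bytes instead use the scalar mov sequences
interleaved above, again with overlapping 8/4/2-byte pairs.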
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
--
2.25.1
^ permalink raw reply [flat|nested] 49+ messages in thread
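The jump tables deleted above store each entry as a 32-bit offset (`.int JMPTBL (target, table)` expands to `target - table`) rather than an absolute pointer, which keeps entries to 4 bytes and avoids dynamic relocations in position-independent code. A minimal sketch of the dispatch sequence, matching the BRANCH_TO_JMPTBL_ENTRY macro quoted in the next patch, assuming %rdx holds the table index:
	lea	L(table_144_bytes_bwd)(%rip), %r11	/* table base address */
	movslq	(%r11, %rdx, 4), %r9	/* sign-extend 32-bit entry: target - table */
	add	%r11, %r9	/* rebase to the absolute target address */
	_CET_NOTRACK jmp	*%r9	/* indirect jump; NOTRACK: target needs no ENDBR64 */
	ud2	/* trap if control ever falls through */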
* [PATCH v3 3/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (3 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
` (4 subsequent siblings)
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and EVEX
versions are generally preferable. memcpy/memmove is one exception, where
avoiding unaligned loads with `palignr` is important for some targets (a
short sketch of the technique follows below).
This commit replaces memmove-ssse3 with a better-optimized, lower
code-footprint version. It also aliases memcpy to memmove.
Aside from this function, all other SSSE3 functions should be safe to
remove.
Performance is not changed drastically, although it shows an overall
improvement without any major regressions or gains.
bench-memcpy geometric_mean(N=50) New / Original: 0.962
bench-memcpy-random geometric_mean(N=50) New / Original: 0.895
bench-memcpy-large geometric_mean(N=50) New / Original: 0.894
Benchmarks were run on a Zhaoxin KX-6840 @ 2000MHz. See the attached
numbers for all results.
More importantly, this saves 7246 bytes of code size in memmove and an
additional 10741 bytes by reusing the memmove code for memcpy (17987
bytes saved in total), as well as an additional 896 bytes of rodata for
the jump table entries.
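A minimal sketch of the `palignr` technique (illustrative only, not part of
the patch): with %rsi pointing one byte past a 16-byte boundary and %rdi
16-byte aligned, two aligned loads are spliced into the 16 unaligned source
bytes, so only aligned accesses touch memory:
	movaps	-0x01(%rsi), %xmm1	/* aligned load: source bytes [-1, 14] */
	movaps	0x0f(%rsi), %xmm2	/* aligned load: source bytes [15, 30] */
	palignr	$1, %xmm1, %xmm2	/* splice: xmm2 = source bytes [0, 15] */
	movdqa	%xmm2, (%rdi)	/* one aligned 16-byte store */
Aliasing memcpy to memmove is sound because a memmove already satisfies
memcpy's contract for non-overlapping buffers; the alias itself is
presumably just a symbol alias (e.g. strong_alias) on the SSSE3 memmove
entry point.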
---
sysdeps/x86_64/multiarch/Makefile | 1 -
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 ----------------------
sysdeps/x86_64/multiarch/memmove-ssse3.S | 386 ++-
3 files changed, 382 insertions(+), 3156 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..84e4e0f6cb 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,382 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE __memmove_ssse3
+# define MEMMOVE_CHK __memmove_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
+#endif
+
+ .section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
+ jmp L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+ movq %rdi, %rax
+L(start):
+ cmpq $16, %rdx
+ jb L(copy_0_15)
+
+ /* These loads are always useful. */
+ movups 0(%rsi), %xmm0
+ movups -16(%rsi, %rdx), %xmm7
+ cmpq $32, %rdx
+ ja L(more_2x_vec)
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm7, -16(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_4x_vec):
+ movups 16(%rsi), %xmm1
+ movups -32(%rsi, %rdx), %xmm2
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm1, 16(%rdi)
+ movups %xmm2, -32(%rdi, %rdx)
+ movups %xmm7, -16(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_0_15):
+ cmpl $8, %edx
+ ja L(copy_9_15)
+
+ cmpl $4, %edx
+ jb L(copy_0_3)
+
+ movl 0(%rsi), %ecx
+ movl -4(%rsi, %rdx), %esi
+ movl %ecx, 0(%rdi)
+ movl %esi, -4(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_9_15):
+ movq 0(%rsi), %rcx
+ movq -8(%rsi, %rdx), %rsi
+ movq %rcx, 0(%rdi)
+ movq %rsi, -8(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 4
+L(copy_0_3):
+ cmpl $1, %edx
+ jl L(copy_0_0)
+ movzbl (%rsi), %ecx
+ je L(copy_0_1)
+
+ movzwl -2(%rsi, %rdx), %esi
+ movw %si, -2(%rdi, %rdx)
+L(copy_0_1):
+ movb %cl, (%rdi)
+L(copy_0_0):
+L(nop):
+ ret
+
+ .p2align 4
+L(more_2x_vec):
+ cmpq $64, %rdx
+ jbe L(copy_4x_vec)
+
+	/* We use rcx later to get the palignr shift value.  */
+ movq %rdi, %rcx
+
+	/* Copy backward for overlapping regions with dst > src (memmove safety).  */
+ subq %rsi, %rcx
+ cmpq %rdx, %rcx
+ jb L(copy_backward)
+
+ /* Load tail. */
+
+ /* -16(%rsi, %rdx) already loaded into xmm7. */
+ movups -32(%rsi, %rdx), %xmm8
+ movups -48(%rsi, %rdx), %xmm9
+
+ /* Get misalignment. */
+ andl $0xf, %ecx
+
+ movq %rsi, %r9
+ addq %rcx, %rsi
+ andq $-16, %rsi
+ /* Get first vec for `palignr`. */
+ movaps (%rsi), %xmm1
+
+ /* We have loaded (%rsi) so safe to do this store before the
+ loop. */
+ movups %xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+ cmp __x86_shared_cache_size_half(%rip), %rdx
+#endif
+ ja L(large_memcpy)
+
+ leaq -64(%rdi, %rdx), %r8
+ andq $-16, %rdi
+ movl $48, %edx
+
+ leaq L(loop_fwd_start)(%rip), %r9
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(copy_backward):
+ testq %rcx, %rcx
+ jz L(nop)
+
+ /* Preload tail. */
+
+ /* (%rsi) already loaded into xmm0. */
+ movups 16(%rsi), %xmm4
+ movups 32(%rsi), %xmm5
+
+ movq %rdi, %r8
+ subq %rdi, %rsi
+ leaq -49(%rdi, %rdx), %rdi
+ andq $-16, %rdi
+ addq %rdi, %rsi
+ andq $-16, %rsi
+
+ movaps 48(%rsi), %xmm6
+
+
+ leaq L(loop_bkwd_start)(%rip), %r9
+ andl $0xf, %ecx
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(large_memcpy):
+ movups -64(%r9, %rdx), %xmm10
+ movups -80(%r9, %rdx), %xmm11
+
+ sall $5, %ecx
+ leal (%rcx, %rcx, 2), %r8d
+ leaq -96(%rdi, %rdx), %rcx
+ andq $-16, %rdi
+ leaq L(large_loop_fwd_start)(%rip), %rdx
+ addq %r8, %rdx
+ jmp * %rdx
+
+
+	/* Instead of a typical jump table, all 16 loops are exactly
+	   64 bytes in size, so we can just jump to the first loop +
+	   align * 64.  Before modifying any loop ensure all their
+	   sizes match!  */
+ .p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps %xmm1, 16(%rdi)
+ movaps %xmm2, 32(%rdi)
+ movaps %xmm3, 48(%rdi)
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+ cmpq %rdi, %r8
+ ja L(loop_fwd_0x0)
+L(end_loop_fwd):
+ movups %xmm9, 16(%r8)
+ movups %xmm8, 32(%r8)
+ movups %xmm7, 48(%r8)
+ ret
+
+	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_FWD(align_by); \
+ .p2align 6; \
+L(loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm4, %xmm1; \
+ movaps %xmm0, 16(%rdi); \
+ movaps %xmm2, 32(%rdi); \
+ movaps %xmm3, 48(%rdi); \
+ addq %rdx, %rdi; \
+ addq %rdx, %rsi; \
+ cmpq %rdi, %r8; \
+ ja L(loop_fwd_ ## align_by); \
+ jmp L(end_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_FWD (0xf)
+ ALIGNED_LOOP_FWD (0xe)
+ ALIGNED_LOOP_FWD (0xd)
+ ALIGNED_LOOP_FWD (0xc)
+ ALIGNED_LOOP_FWD (0xb)
+ ALIGNED_LOOP_FWD (0xa)
+ ALIGNED_LOOP_FWD (0x9)
+ ALIGNED_LOOP_FWD (0x8)
+ ALIGNED_LOOP_FWD (0x7)
+ ALIGNED_LOOP_FWD (0x6)
+ ALIGNED_LOOP_FWD (0x5)
+ ALIGNED_LOOP_FWD (0x4)
+ ALIGNED_LOOP_FWD (0x3)
+ ALIGNED_LOOP_FWD (0x2)
+ ALIGNED_LOOP_FWD (0x1)
+
+ .p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps 64(%rsi), %xmm4
+ movaps 80(%rsi), %xmm5
+ movntps %xmm1, 16(%rdi)
+ movntps %xmm2, 32(%rdi)
+ movntps %xmm3, 48(%rdi)
+ movntps %xmm4, 64(%rdi)
+ movntps %xmm5, 80(%rdi)
+ addq $80, %rdi
+ addq $80, %rsi
+ cmpq %rdi, %rcx
+ ja L(large_loop_fwd_0x0)
+
+ /* Ensure no icache line split on tail. */
+ .p2align 4
+L(end_large_loop_fwd):
+ sfence
+ movups %xmm11, 16(%rcx)
+ movups %xmm10, 32(%rcx)
+ movups %xmm9, 48(%rcx)
+ movups %xmm8, 64(%rcx)
+ movups %xmm7, 80(%rcx)
+ ret
+
+
+	/* Each loop body is > 64 bytes and <= 96 bytes.  The 32-byte
+	   alignment ensures 96-byte spacing between them.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by); \
+ .p2align 5; \
+L(large_loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps 64(%rsi), %xmm4; \
+ movaps 80(%rsi), %xmm5; \
+ movaps %xmm5, %xmm6; \
+ palignr $align_by, %xmm4, %xmm5; \
+ palignr $align_by, %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm6, %xmm1; \
+ movntps %xmm0, 16(%rdi); \
+ movntps %xmm2, 32(%rdi); \
+ movntps %xmm3, 48(%rdi); \
+ movntps %xmm4, 64(%rdi); \
+ movntps %xmm5, 80(%rdi); \
+ addq $80, %rdi; \
+ addq $80, %rsi; \
+ cmpq %rdi, %rcx; \
+ ja L(large_loop_fwd_ ## align_by); \
+ jmp L(end_large_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LARGE_LOOP_FWD (0xf)
+ ALIGNED_LARGE_LOOP_FWD (0xe)
+ ALIGNED_LARGE_LOOP_FWD (0xd)
+ ALIGNED_LARGE_LOOP_FWD (0xc)
+ ALIGNED_LARGE_LOOP_FWD (0xb)
+ ALIGNED_LARGE_LOOP_FWD (0xa)
+ ALIGNED_LARGE_LOOP_FWD (0x9)
+ ALIGNED_LARGE_LOOP_FWD (0x8)
+ ALIGNED_LARGE_LOOP_FWD (0x7)
+ ALIGNED_LARGE_LOOP_FWD (0x6)
+ ALIGNED_LARGE_LOOP_FWD (0x5)
+ ALIGNED_LARGE_LOOP_FWD (0x4)
+ ALIGNED_LARGE_LOOP_FWD (0x3)
+ ALIGNED_LARGE_LOOP_FWD (0x2)
+ ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+ .p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+ movaps 32(%rsi), %xmm1
+ movaps 16(%rsi), %xmm2
+ movaps 0(%rsi), %xmm3
+ movaps %xmm1, 32(%rdi)
+ movaps %xmm2, 16(%rdi)
+ movaps %xmm3, 0(%rdi)
+ subq $48, %rdi
+ subq $48, %rsi
+ cmpq %rdi, %r8
+ jb L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+ movups %xmm7, -16(%r8, %rdx)
+ movups %xmm0, 0(%r8)
+ movups %xmm4, 16(%r8)
+ movups %xmm5, 32(%r8)
+
+ ret
+
+
+	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by); \
+ .p2align 6; \
+L(loop_bkwd_ ## align_by): \
+ movaps 32(%rsi), %xmm1; \
+ movaps 16(%rsi), %xmm2; \
+ movaps 0(%rsi), %xmm3; \
+ palignr $align_by, %xmm1, %xmm6; \
+ palignr $align_by, %xmm2, %xmm1; \
+ palignr $align_by, %xmm3, %xmm2; \
+ movaps %xmm6, 32(%rdi); \
+ movaps %xmm1, 16(%rdi); \
+ movaps %xmm2, 0(%rdi); \
+ subq $48, %rdi; \
+ subq $48, %rsi; \
+ movaps %xmm3, %xmm6; \
+ cmpq %rdi, %r8; \
+ jb L(loop_bkwd_ ## align_by); \
+ jmp L(end_loop_bkwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_BKWD (0xf)
+ ALIGNED_LOOP_BKWD (0xe)
+ ALIGNED_LOOP_BKWD (0xd)
+ ALIGNED_LOOP_BKWD (0xc)
+ ALIGNED_LOOP_BKWD (0xb)
+ ALIGNED_LOOP_BKWD (0xa)
+ ALIGNED_LOOP_BKWD (0x9)
+ ALIGNED_LOOP_BKWD (0x8)
+ ALIGNED_LOOP_BKWD (0x7)
+ ALIGNED_LOOP_BKWD (0x6)
+ ALIGNED_LOOP_BKWD (0x5)
+ ALIGNED_LOOP_BKWD (0x4)
+ ALIGNED_LOOP_BKWD (0x3)
+ ALIGNED_LOOP_BKWD (0x2)
+ ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
--
2.25.1
^ permalink raw reply [flat|nested] 49+ messages in thread
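A note on the forward/backward dispatch in the new __memmove_ssse3 above: the
single unsigned compare (`subq %rsi, %rcx; cmpq %rdx, %rcx; jb L(copy_backward)`)
covers the overlap test and the dst > src test at once.  Below is a minimal C
sketch of that idea; the function name is illustrative, not anything from the
patch.

#include <stddef.h>
#include <stdint.h>

/* Sketch of the overlap test: (dst - src) mod 2^64 is below len
   exactly when dst lands inside [src, src + len), i.e. when a
   forward copy would overwrite source bytes not yet read.  When
   dst <= src the unsigned difference wraps to a huge value and the
   forward path is taken.  (The pointer arithmetic here is only
   illustrative; the real code works on integer registers.)  */
static int
must_copy_backward (const char *dst, const char *src, size_t len)
{
  return (uintptr_t) (dst - src) < (uintptr_t) len;
}

The same difference is reused afterwards: its low 4 bits select the palignr
loop (`andl $0xf, %ecx`), and a zero difference short-circuits to L(nop) since
dst == src needs no copy.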
* [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (4 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
` (3 subsequent siblings)
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
The new code saves size (-303 bytes) and has significantly better
performance.
geometric_mean(N=20) of page cross cases New / Original: 0.634
---
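For reference, a geometric mean of per-case ratios like the one quoted above
can be computed as sketched below; this assumes per-case timings collected
from benchmark runs and is not the actual benchtests harness code.

#include <math.h>
#include <stddef.h>

/* Sketch: geometric mean of n timing ratios (new / original),
   accumulated in log space so no single case dominates and speedups
   and slowdowns weigh symmetrically.  A value of 0.634 means the new
   code takes ~63.4% of the original's time on a typical case.  */
static double
geomean_ratio (const double *t_new, const double *t_orig, size_t n)
{
  double acc = 0.0;
  for (size_t i = 0; i < n; i++)
    acc += log (t_new[i] / t_orig[i]);
  return exp (acc / (double) n);
}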
sysdeps/x86_64/memcmp.S | 884 ++++++++++++++---------
sysdeps/x86_64/memcmpeq.S | 2 +-
sysdeps/x86_64/multiarch/Makefile | 2 +-
sysdeps/x86_64/multiarch/memcmp-sse2.S | 4 +-
sysdeps/x86_64/multiarch/memcmpeq-sse2.S | 4 +-
sysdeps/x86_64/multiarch/wmemcmp-c.c | 9 -
sysdeps/x86_64/multiarch/wmemcmp-sse2.S | 25 +
sysdeps/x86_64/wmemcmp.S | 21 +
8 files changed, 575 insertions(+), 376 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S
create mode 100644 sysdeps/x86_64/wmemcmp.S
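The diff below centers on one SSE2 primitive: pcmpeqb + pmovmskb produce a
16-bit mask that equals 0xffff iff all 16 bytes match, so subtracting 0xffff
(which the patch keeps preloaded in %ecx for code size) yields zero exactly on
a full match.  A rough intrinsics rendering of that step follows; it is
illustrative only, the patch itself being the hand-written assembly.

#include <emmintrin.h>

/* Sketch of the per-vector compare: returns 0 iff the 16 bytes at
   a and b are identical.  On a mismatch the result is mask - 0xffff
   = -(0xffff - mask); negation preserves the lowest set bit, so a
   bsf of the nonzero result still locates the first differing byte,
   and bits 16..31 come out as all ones, which the later "High 16
   bits of eax guaranteed to be all ones" comment relies on.  */
static int
vec16_cmp_mask (const void *a, const void *b)
{
  __m128i va = _mm_loadu_si128 ((const __m128i *) a);
  __m128i vb = _mm_loadu_si128 ((const __m128i *) b);
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (va, vb));
  return mask - 0xffff;
}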
diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
index e02a53ea1e..b153694048 100644
--- a/sysdeps/x86_64/memcmp.S
+++ b/sysdeps/x86_64/memcmp.S
@@ -18,395 +18,557 @@
#include <sysdep.h>
- .text
-ENTRY (memcmp)
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+#ifdef USE_AS_WMEMCMP
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define SIZE_OFFSET (0)
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
#endif
- test %RDX_LP, %RDX_LP
- jz L(finz)
- cmpq $1, %rdx
- jbe L(finr1b)
- subq %rdi, %rsi
- movq %rdx, %r10
- cmpq $32, %r10
- jae L(gt32)
- /* Handle small chunks and last block of less than 32 bytes. */
-L(small):
- testq $1, %r10
- jz L(s2b)
- movzbl (%rdi), %eax
- movzbl (%rdi, %rsi), %edx
- subq $1, %r10
- je L(finz1)
- addq $1, %rdi
- subl %edx, %eax
- jnz L(exit)
-L(s2b):
- testq $2, %r10
- jz L(s4b)
- movzwl (%rdi), %eax
- movzwl (%rdi, %rsi), %edx
- subq $2, %r10
+
#ifdef USE_AS_MEMCMPEQ
- je L(finz1)
+# define SIZE_OFFSET (0)
+# define CHECK_CMP(x, y) subl x, y
#else
- je L(fin2_7)
+# ifndef SIZE_OFFSET
+# define SIZE_OFFSET (CHAR_PER_VEC * 2)
+# endif
+# define CHECK_CMP(x, y) cmpl x, y
#endif
- addq $2, %rdi
- cmpl %edx, %eax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+
+#define VEC_SIZE 16
+#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+#ifndef MEMCMP
+# define MEMCMP memcmp
+#endif
+
+ .text
+ENTRY(MEMCMP)
+#ifdef USE_AS_WMEMCMP
+ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+ in ecx for code size. This is preferable to using `incw` as
+ it avoids partial register stalls on older hardware (pre
+ SnB). */
+ movl $0xffff, %ecx
+#endif
+ cmpq $CHAR_PER_VEC, %rdx
+ ja L(more_1x_vec)
+
+#ifdef USE_AS_WMEMCMP
+	/* Saves a byte of code by keeping the fall-through path for
+	   n = [2, 4] in the initial cache line.  */
+ decl %edx
+ jle L(cmp_0_1)
+
+ movq (%rsi), %xmm0
+ movq (%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_0)
+
+ movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_end_0_adj)
#else
- jnz L(fin2_7)
+ cmpl $8, %edx
+ ja L(cmp_9_16)
+
+ cmpl $4, %edx
+ jb L(cmp_0_3)
+
+# ifdef USE_AS_MEMCMPEQ
+ movl (%rsi), %eax
+ subl (%rdi), %eax
+
+ movl -4(%rsi, %rdx), %esi
+ subl -4(%rdi, %rdx), %esi
+
+ orl %esi, %eax
+ ret
+# else
+	/* Combine the lo and hi 4-byte loads into one 8-byte compare.  */
+ movl -4(%rsi, %rdx), %ecx
+ movl -4(%rdi, %rdx), %eax
+ shlq $32, %rcx
+ shlq $32, %rax
+ movl (%rsi), %esi
+ movl (%rdi), %edi
+ orq %rsi, %rcx
+ orq %rdi, %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+ xorl %eax, %eax
+ ret
+# endif
+
+ .p2align 4,, 10
+L(cmp_9_16):
+# ifdef USE_AS_MEMCMPEQ
+ movq (%rsi), %rax
+ subq (%rdi), %rax
+
+ movq -8(%rsi, %rdx), %rcx
+ subq -8(%rdi, %rdx), %rcx
+ orq %rcx, %rax
+ /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
+ return long). */
+ setnz %cl
+ movzbl %cl, %eax
+# else
+ movq (%rsi), %rcx
+ movq (%rdi), %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+
+ movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
+ movq -8(%rdi, %rdx, CHAR_SIZE), %rax
+ /* Only compute proper return if not-equal. */
+ cmpq %rcx, %rax
+ jnz L(ret_nonzero)
+ xorl %eax, %eax
+# endif
#endif
-L(s4b):
- testq $4, %r10
- jz L(s8b)
- movl (%rdi), %eax
- movl (%rdi, %rsi), %edx
- subq $4, %r10
-#ifdef USE_AS_MEMCMPEQ
- je L(finz1)
+ ret
+
+ .p2align 4,, 8
+L(cmp_0_1):
+ /* Flag set by earlier comparison against 1. */
+ jne L(cmp_0_0)
+#ifdef USE_AS_WMEMCMP
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(cmp_0_0)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
#else
- je L(fin2_7)
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
#endif
- addq $4, %rdi
- cmpl %edx, %eax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+ ret
+
+ /* Fits in aligning bytes. */
+L(cmp_0_0):
+ xorl %eax, %eax
+ ret
+
+#ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(ret_nonzero_vec_start_0):
+ bsfl %eax, %eax
+ movl (%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+ ret
+#else
+
+# ifndef USE_AS_MEMCMPEQ
+ .p2align 4,, 14
+L(ret_nonzero):
+ /* Need to bswap to get proper return without branch. */
+ bswapq %rcx
+ bswapq %rax
+ subq %rcx, %rax
+ sbbl %eax, %eax
+ orl $1, %eax
+ ret
+# endif
+
+ .p2align 4
+L(cmp_0_3):
+# ifdef USE_AS_MEMCMPEQ
+	/* No reason to add to the dependency chain on rdx.  Saving
+	   the bytes here doesn't change the number of fetch blocks.  */
+ cmpl $1, %edx
+ jbe L(cmp_0_1)
+# else
+	/* We need the smaller code size to prevent taking an extra
+	   fetch block.  */
+ decl %edx
+ jle L(cmp_0_1)
+# endif
+ movzwl (%rsi), %ecx
+ movzwl (%rdi), %eax
+
+# ifdef USE_AS_MEMCMPEQ
+ subl %ecx, %eax
+
+ movzbl -1(%rsi, %rdx), %esi
+ movzbl -1(%rdi, %rdx), %edi
+ subl %edi, %esi
+ orl %esi, %eax
+# else
+ bswapl %ecx
+ bswapl %eax
+
+ /* Implicit right shift by one. We just need to displace the
+ sign bits. */
+ shrl %ecx
+ shrl %eax
+
+	/* Eat a partial register stall here.  The smaller code stops
+	   L(cmp_0_3) from bleeding into the next fetch block and
+	   saves an ALU op.  */
+ movb (%rsi, %rdx), %cl
+ movzbl (%rdi, %rdx), %edi
+ orl %edi, %eax
+ subl %ecx, %eax
+# endif
+ ret
+#endif
+
+ .p2align 5
+L(more_1x_vec):
+#ifndef USE_AS_WMEMCMP
+ /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
+ in ecx for code size. This is preferable to using `incw` as
+ it avoids partial register stalls on older hardware (pre
+ SnB). */
+ movl $0xffff, %ecx
+#endif
+ movups (%rsi), %xmm0
+ movups (%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_0)
+#if SIZE_OFFSET == 0
+ cmpq $(CHAR_PER_VEC * 2), %rdx
#else
- jnz L(fin2_7)
+ /* Offset rdx. Saves just enough code size to keep the
+ L(last_2x_vec) case and the non-zero return in a single
+ cache line. */
+ subq $(CHAR_PER_VEC * 2), %rdx
#endif
-L(s8b):
- testq $8, %r10
- jz L(s16b)
- movq (%rdi), %rax
- movq (%rdi, %rsi), %rdx
- subq $8, %r10
-#ifdef USE_AS_MEMCMPEQ
- je L(sub_return8)
+ ja L(more_2x_vec)
+
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+#ifndef USE_AS_MEMCMPEQ
+	/* Don't use `incw ax` as machines this code runs on are liable
+	   to have a partial register stall.  */
+ jnz L(ret_nonzero_vec_end_0)
#else
- je L(fin2_7)
+ /* Various return targets for memcmpeq. Will always be hot in
+ Icache and get short encoding. */
+L(ret_nonzero_vec_start_1):
+L(ret_nonzero_vec_start_0):
+L(ret_nonzero_vec_end_0):
#endif
- addq $8, %rdi
- cmpq %rdx, %rax
-#ifdef USE_AS_MEMCMPEQ
- jnz L(neq_early)
+ ret
+
+#ifndef USE_AS_MEMCMPEQ
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(ret_nonzero_vec_end_0_adj):
+ addl $3, %edx
+# else
+ .p2align 4,, 8
+# endif
+L(ret_nonzero_vec_end_0):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leal (%rax, %rdx, CHAR_SIZE), %eax
+ movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addl %edx, %eax
+ movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+# ifndef USE_AS_WMEMCMP
+ .p2align 4,, 10
+L(ret_nonzero_vec_start_0):
+ bsfl %eax, %eax
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+ ret
+# endif
#else
- jnz L(fin2_7)
#endif
-L(s16b):
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
+
+ .p2align 5
+L(more_2x_vec):
+ movups (VEC_SIZE * 1)(%rsi), %xmm0
+ movups (VEC_SIZE * 1)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_vec_start_1)
+
+ cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
+ jbe L(last_2x_vec)
+
+ cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
+ ja L(more_8x_vec)
+
+	/* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
+	   This can harm performance if the non-zero return is in [65,
+	   80] or [97, 112], but helps performance otherwise.  The
+	   zero-return case is generally hotter.  */
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 2)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 3)(%rsi), %xmm2
+ movups (VEC_SIZE * 3)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
+ jnz L(ret_nonzero_vec_start_2_3)
+
+ cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
+ jbe L(last_2x_vec)
+
+ movups (VEC_SIZE * 4)(%rsi), %xmm0
+ movups (VEC_SIZE * 4)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 5)(%rsi), %xmm2
+ movups (VEC_SIZE * 5)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
#ifdef USE_AS_MEMCMPEQ
- pmovmskb %xmm1, %eax
- subl $0xffff, %eax
+ jz L(last_2x_vec)
ret
#else
- pmovmskb %xmm1, %edx
- xorl %eax, %eax
- subl $0xffff, %edx
- jz L(finz)
- bsfl %edx, %ecx
- leaq (%rdi, %rcx), %rcx
- movzbl (%rcx), %eax
- movzbl (%rsi, %rcx), %edx
- jmp L(finz1)
+ jnz L(ret_nonzero_vec_start_4_5)
#endif
- .p2align 4,, 4
-L(finr1b):
- movzbl (%rdi), %eax
- movzbl (%rsi), %edx
-L(finz1):
- subl %edx, %eax
-L(exit):
- ret
+ .p2align 4
+L(last_2x_vec):
+ movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
+ movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
+ movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ subl %ecx, %eax
#ifdef USE_AS_MEMCMPEQ
- .p2align 4,, 4
-L(sub_return8):
- subq %rdx, %rax
- movl %eax, %edx
- shrq $32, %rax
- orl %edx, %eax
+ /* Various return targets for memcmpeq. Will always be hot in
+ Icache and get short encoding. */
+L(ret_nonzero_vec_start_2_3):
+L(ret_nonzero_vec_start_4_5):
ret
#else
- .p2align 4,, 4
-L(fin2_7):
- cmpq %rdx, %rax
- jz L(finz)
- movq %rax, %r11
- subq %rdx, %r11
- bsfq %r11, %rcx
- sarq $3, %rcx
- salq $3, %rcx
- sarq %cl, %rax
- movzbl %al, %eax
- sarq %cl, %rdx
- movzbl %dl, %edx
- subl %edx, %eax
+ jnz L(ret_nonzero_vec_end_1)
ret
-#endif
- .p2align 4,, 4
-L(finz):
- xorl %eax, %eax
+
+ .p2align 4,, 8
+L(ret_nonzero_vec_end_1):
+ pmovmskb %xmm1, %ecx
+	/* High 16 bits of eax guaranteed to be all ones.  Rotate them
+	   in so we can do `or + not` with just `xor`.  */
+ rorl $16, %eax
+ xorl %ecx, %eax
+ /* Partial register stall. */
+
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leal (%rax, %rdx, CHAR_SIZE), %eax
+ movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addl %edx, %eax
+ movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
-#ifdef USE_AS_MEMCMPEQ
- .p2align 4,, 4
-L(neq_early):
- movl $1, %eax
+
+ .p2align 4
+L(ret_nonzero_vec_start_4_5):
+ pmovmskb %xmm1, %edx
+ sall $16, %eax
+ leal 1(%rax, %rdx), %eax
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4,, 8
+L(ret_nonzero_vec_start_1):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
#endif
- /* For blocks bigger than 32 bytes
- 1. Advance one of the addr pointer to be 16B aligned.
- 2. Treat the case of both addr pointers aligned to 16B
- separately to avoid movdqu.
- 3. Handle any blocks of greater than 64 consecutive bytes with
- unrolling to reduce branches.
- 4. At least one addr pointer is 16B aligned, use memory version
- of pcmbeqb.
- */
- .p2align 4,, 4
-L(gt32):
- movq %rdx, %r11
- addq %rdi, %r11
- movq %rdi, %r8
-
- andq $15, %r8
- jz L(16am)
- /* Both pointers may be misaligned. */
- movdqu (%rdi), %xmm1
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb %xmm0, %xmm1
- pmovmskb %xmm1, %edx
- subl $0xffff, %edx
- jnz L(neq)
- neg %r8
- leaq 16(%rdi, %r8), %rdi
-L(16am):
- /* Handle two 16B aligned pointers separately. */
- testq $15, %rsi
- jz L(ATR)
- testq $16, %rdi
- jz L(A32)
- movdqu (%rdi, %rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-L(A32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
- /* Pre-unroll to be ready for unrolled 64B loop. */
- testq $32, %rdi
- jz L(A64)
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(A64):
- movq %r11, %r10
- andq $-64, %r10
- cmpq %r10, %rdi
- jae L(mt32)
-
-L(A64main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A64main)
-
-L(mt32):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
-
-L(A32main):
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqu (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %rdi, %r10
- jne L(A32main)
-L(mt16):
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
-
- .p2align 4,, 4
-L(neq):
-#ifdef USE_AS_MEMCMPEQ
- movl $1, %eax
- ret
-#else
- bsfl %edx, %ecx
- movzbl (%rdi, %rcx), %eax
- addq %rdi, %rsi
- movzbl (%rsi,%rcx), %edx
- jmp L(finz1)
+
+ .p2align 4
+L(more_8x_vec):
+ subq %rdi, %rsi
+ leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
+ andq $(VEC_SIZE * -1), %rdi
+ addq %rdi, %rsi
+ .p2align 4
+L(loop_4x):
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 3)(%rsi), %xmm1
+
+ PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
+ PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
+
+ movups (VEC_SIZE * 4)(%rsi), %xmm2
+ movups (VEC_SIZE * 5)(%rsi), %xmm3
+
+ PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
+ PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
+
+ pand %xmm0, %xmm1
+ pand %xmm2, %xmm3
+ pand %xmm1, %xmm3
+
+ pmovmskb %xmm3, %eax
+ subl %ecx, %eax
+ jnz L(ret_nonzero_loop)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+ cmpq %rdi, %rdx
+ ja L(loop_4x)
+ /* Get remaining length in edx. */
+ subl %edi, %edx
+ /* Restore offset so we can reuse L(last_2x_vec). */
+ addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
+#ifdef USE_AS_WMEMCMP
+ shrl $2, %edx
#endif
+ cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
+ jbe L(last_2x_vec)
+
- .p2align 4,, 4
-L(ATR):
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
- testq $16, %rdi
- jz L(ATR32)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- je L(mt16)
-
-L(ATR32):
- movq %r11, %r10
- andq $-64, %r10
- testq $32, %rdi
- jz L(ATR64)
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
-L(ATR64):
- cmpq %rdi, %r10
- je L(mt32)
-
-L(ATR64main):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
- cmpq %rdi, %r10
- jne L(ATR64main)
-
- movq %r11, %r10
- andq $-32, %r10
- cmpq %r10, %rdi
- jae L(mt16)
-
-L(ATR32res):
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- movdqa (%rdi,%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
- pmovmskb %xmm0, %edx
- subl $0xffff, %edx
- jnz L(neq)
- addq $16, %rdi
-
- cmpq %r10, %rdi
- jne L(ATR32res)
-
- subq %rdi, %r11
- je L(finz)
- movq %r11, %r10
- jmp L(small)
- /* Align to 16byte to improve instruction fetch. */
- .p2align 4,, 4
-END(memcmp)
+ movups (VEC_SIZE * 2)(%rsi), %xmm0
+ movups (VEC_SIZE * 2)(%rdi), %xmm1
+ PCMPEQ %xmm0, %xmm1
+ movups (VEC_SIZE * 3)(%rsi), %xmm2
+ movups (VEC_SIZE * 3)(%rdi), %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pand %xmm1, %xmm3
+ pmovmskb %xmm3, %eax
+ CHECK_CMP (%ecx, %eax)
+ jz L(last_2x_vec)
#ifdef USE_AS_MEMCMPEQ
-libc_hidden_def (memcmp)
+L(ret_nonzero_loop):
+ ret
#else
-# undef bcmp
-weak_alias (memcmp, bcmp)
-libc_hidden_builtin_def (memcmp)
+
+ .p2align 4
+L(ret_nonzero_vec_start_2_3):
+ pmovmskb %xmm1, %edx
+ sall $16, %eax
+ leal 1(%rax, %rdx), %eax
+
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(ret_nonzero_loop):
+ pmovmskb %xmm0, %ecx
+ pmovmskb %xmm1, %edx
+ sall $(VEC_SIZE * 1), %edx
+ leal 1(%rcx, %rdx), %edx
+ pmovmskb %xmm2, %ecx
+ /* High 16 bits of eax guaranteed to be all ones. Rotate them in
+ so we can do `or + not` with just `xor`. */
+ rorl $16, %eax
+ xorl %ecx, %eax
+
+ salq $32, %rax
+ orq %rdx, %rax
+
+ bsfq %rax, %rax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+#endif
+END(MEMCMP)
+
+#ifndef USE_AS_WMEMCMP
+# ifdef USE_AS_MEMCMPEQ
+libc_hidden_def (MEMCMP)
+# else
+# undef bcmp
+weak_alias (MEMCMP, bcmp)
+libc_hidden_builtin_def (MEMCMP)
+# endif
#endif
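
Two idioms recur in the new SSE2 memcmp above and may be easier to
follow in C. The sketch below is a hedged restatement, not the actual
implementation, and the function names are illustrative only:
chunk64_differs mirrors the L(loop_4x) pattern of pand-combining four
pcmpeqb masks so a single pmovmskb test covers 64 bytes, and
sign_from_setg mirrors the branchless `xorl; cmpl; setg %dl;
leal -1(%rdx, %rdx), %eax` return-value computation on the wmemcmp
paths.

#include <emmintrin.h>
#include <stdint.h>

/* Mirrors L(loop_4x): four 16-byte compares whose results are
   AND-combined so one pmovmskb suffices to test 64 bytes.  */
static int
chunk64_differs (const unsigned char *a, const unsigned char *b)
{
  __m128i m0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + 0)),
                               _mm_loadu_si128 ((const __m128i *) (b + 0)));
  __m128i m1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + 16)),
                               _mm_loadu_si128 ((const __m128i *) (b + 16)));
  __m128i m2 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + 32)),
                               _mm_loadu_si128 ((const __m128i *) (b + 32)));
  __m128i m3 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + 48)),
                               _mm_loadu_si128 ((const __m128i *) (b + 48)));
  __m128i all = _mm_and_si128 (_mm_and_si128 (m0, m1),
                               _mm_and_si128 (m2, m3));
  /* 0xffff means every byte lane compared equal.  */
  return _mm_movemask_epi8 (all) != 0xffff;
}

/* Mirrors the wmemcmp return path: given two unequal wchar_t values,
   produce +1 or -1 without a branch.  */
static int
sign_from_setg (int32_t a, int32_t b)
{
  int g = a > b;      /* setg %dl (after the xorl zero idiom).  */
  return 2 * g - 1;   /* leal -1(%rdx, %rdx), %eax.  */
}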
diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S
index 2cee881fed..80c5e912a6 100644
--- a/sysdeps/x86_64/memcmpeq.S
+++ b/sysdeps/x86_64/memcmpeq.S
@@ -16,6 +16,6 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
-#define memcmp __memcmpeq
+#define MEMCMP __memcmpeq
#define USE_AS_MEMCMPEQ 1
#include "multiarch/memcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index e7ea963fc0..b573966966 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -162,8 +162,8 @@ sysdep_routines += \
wmemchr-sse2 \
wmemcmp-avx2-movbe \
wmemcmp-avx2-movbe-rtm \
- wmemcmp-c \
wmemcmp-evex-movbe \
+ wmemcmp-sse2 \
wmemcmp-sse4 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S
index e10555638d..4080fc1875 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S
@@ -17,8 +17,8 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# ifndef memcmp
-# define memcmp __memcmp_sse2
+# ifndef MEMCMP
+# define MEMCMP __memcmp_sse2
# endif
# ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
index de7f5a7525..9d991e5c74 100644
--- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
+++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
@@ -17,9 +17,9 @@
<https://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define memcmp __memcmpeq_sse2
+# define MEMCMP __memcmpeq_sse2
#else
-# define memcmp __memcmpeq
+# define MEMCMP __memcmpeq
#endif
#define USE_AS_MEMCMPEQ 1
#include "memcmp-sse2.S"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
deleted file mode 100644
index 46b6715e18..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
+++ /dev/null
@@ -1,9 +0,0 @@
-#if IS_IN (libc)
-# include <wchar.h>
-
-# define WMEMCMP __wmemcmp_sse2
-
-extern __typeof (wmemcmp) __wmemcmp_sse2;
-#endif
-
-#include "wcsmbs/wmemcmp.c"
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
new file mode 100644
index 0000000000..57be1c446e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
@@ -0,0 +1,25 @@
+/* wmemcmp optimized with SSE2.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+# define MEMCMP __wmemcmp_sse2
+#else
+# define MEMCMP wmemcmp
+#endif
+#define USE_AS_WMEMCMP 1
+#include "memcmp-sse2.S"
diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S
new file mode 100644
index 0000000000..032f389158
--- /dev/null
+++ b/sysdeps/x86_64/wmemcmp.S
@@ -0,0 +1,21 @@
+/* wmemcmp optimized with SSE2.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMCMP wmemcmp
+#define USE_AS_WMEMCMP 1
+#include "multiarch/memcmp-sse2.S"
--
2.25.1
* [PATCH v3 5/6] x86: Remove memcmp-sse4.S
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (5 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
` (2 subsequent siblings)
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
The code didn't actually use any SSE4 instructions. The new memcmp-sse2
implementation is also faster.
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
Note there are two regressions when preferring SSE2: Size = 1 and
Size = 65.
Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2
This is intentional. Based on profiles of GCC11 and Python3, Size == 1
is significantly less hot than sizes [4, 8] (which are made hotter).
Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%
GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%
size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625
Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215
This is because A) the checks in range [65, 96] are now unrolled 2x
and B) smaller sizes <= 16 are now given a hotter path. By contrast,
the SSE4 version has a branch for Size = 80. The unrolled version
gets better performance for returns which need both comparisons.
size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888
As well, outside of microbenchmark environments, branch outcomes are
not fully predictable, so the branch will have a real cost.
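
To make the unrolling point concrete, here is a hedged C sketch (not
the actual assembly) of how sizes in (64, 96] can be covered with two
possibly-overlapping 16-byte compares instead of branching on the
exact size. tail_65_96_differs is an illustrative name and assumes
the first 64 bytes were already compared.

#include <emmintrin.h>
#include <stddef.h>

/* For 64 < len <= 96, the ranges [64, 80) and [len - 16, len) always
   cover [64, len), so two unconditional loads replace a branch at
   Size = 80.  */
static int
tail_65_96_differs (const unsigned char *a, const unsigned char *b,
                    size_t len)
{
  __m128i m0 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + 64)),
                               _mm_loadu_si128 ((const __m128i *) (b + 64)));
  __m128i m1 = _mm_cmpeq_epi8 (_mm_loadu_si128 ((const __m128i *) (a + len - 16)),
                               _mm_loadu_si128 ((const __m128i *) (b + len - 16)));
  return _mm_movemask_epi8 (_mm_and_si128 (m0, m1)) != 0xffff;
}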
---
sysdeps/x86_64/multiarch/Makefile | 2 --
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ----
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 ----
3 files changed, 10 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b573966966..0400ea332b 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
memcmp-avx2-movbe-rtm \
memcmp-evex-movbe \
memcmp-sse2 \
- memcmp-sse4 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -164,7 +163,6 @@ sysdep_routines += \
wmemcmp-avx2-movbe-rtm \
wmemcmp-evex-movbe \
wmemcmp-sse2 \
- wmemcmp-sse4 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c6008a73ed..a8afcf81bb 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
- __memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -809,8 +807,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
- __wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 44759a3ad5..c743970fe3 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -46,8 +45,5 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2_movbe);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
return OPTIMIZE (sse2);
}
--
2.25.1
* [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (6 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
@ 2022-04-10 0:42 ` Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
9 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:42 UTC (permalink / raw)
To: libc-alpha
The old code was both inefficient and wasted code size. The new code
is smaller (-62 bytes) and has comparable or better performance in the
page cross case.
geometric_mean(N=20) of page cross cases New / Original: 0.960
size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
1 file changed, 61 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index a34ea1645d..210c9925b6 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
@@ -457,9 +456,33 @@ L(one_or_less):
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif
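
The overlapping big-endian load trick used in the hunks above can be
restated in C. This is a hedged sketch, not the library code: it uses
__builtin_bswap32 in place of the movbe loads and a plain comparison
in place of the `sbbl %eax, %eax; orl $1, %eax` sign materialization,
and cmp_4_7 is an illustrative name.

#include <stdint.h>
#include <string.h>

/* Branchless compare for sizes 4..7: the head and (overlapping) tail
   4-byte loads are placed in big-endian order in one 64-bit value, so
   a single unsigned comparison agrees with byte-wise memcmp order.  */
static int
cmp_4_7 (const unsigned char *a, const unsigned char *b, size_t len)
{
  uint32_t a0, a1, b0, b1;
  memcpy (&a0, a, 4);
  memcpy (&a1, a + len - 4, 4);
  memcpy (&b0, b, 4);
  memcpy (&b1, b + len - 4, 4);
  uint64_t va = ((uint64_t) __builtin_bswap32 (a0) << 32)
                | __builtin_bswap32 (a1);
  uint64_t vb = ((uint64_t) __builtin_bswap32 (b0) << 32)
                | __builtin_bswap32 (b1);
  if (va == vb)
    return 0;   /* Fast path for return zero.  */
  return va > vb ? 1 : -1;
}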
--
2.25.1
* Re: [PATCH v3 1/6] x86: Remove str{p}{n}cpy-ssse3
2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> Full memcpy ssse3 results. Number are comparison of
> geometric mean of N=50 runs on Zhaoxin KX-6840@2000MHz
>
> bench-memcpy:
>
> length, align1, align2, dst > src, New Time / Old Time
> 1, 0, 0, 0, 2.099
> 1, 0, 0, 1, 2.099
> 1, 32, 0, 0, 2.103
> 1, 32, 0, 1, 2.103
> 1, 0, 32, 0, 2.099
> 1, 0, 32, 1, 2.098
> 1, 32, 32, 0, 2.098
> 1, 32, 32, 1, 2.098
> 1, 2048, 0, 0, 2.098
> 1, 2048, 0, 1, 2.098
> 2, 0, 0, 0, 1.135
> 2, 0, 0, 1, 1.136
> 2, 1, 0, 0, 1.139
> 2, 1, 0, 1, 1.139
> 2, 33, 0, 0, 1.165
> 2, 33, 0, 1, 1.139
> 2, 0, 1, 0, 1.136
> 2, 0, 1, 1, 1.136
> 2, 0, 33, 0, 1.136
> 2, 0, 33, 1, 1.136
> 2, 1, 1, 0, 1.136
> 2, 1, 1, 1, 1.136
> 2, 33, 33, 0, 1.136
> 2, 33, 33, 1, 1.136
> 2, 2048, 0, 0, 1.136
> 2, 2048, 0, 1, 1.136
> 2, 2049, 0, 0, 1.191
> 2, 2049, 0, 1, 1.139
> 2, 2048, 1, 0, 1.136
> 2, 2048, 1, 1, 1.136
> 2, 2049, 1, 0, 1.136
> 2, 2049, 1, 1, 1.136
> 4, 0, 0, 0, 1.074
> 4, 0, 0, 1, 0.962
> 4, 2, 0, 0, 0.973
> 4, 2, 0, 1, 0.989
> 4, 34, 0, 0, 0.991
> 4, 34, 0, 1, 0.991
> 4, 0, 2, 0, 0.962
> 4, 0, 2, 1, 0.962
> 4, 0, 34, 0, 0.962
> 4, 0, 34, 1, 0.962
> 4, 2, 2, 0, 0.962
> 4, 2, 2, 1, 0.962
> 4, 34, 34, 0, 0.962
> 4, 34, 34, 1, 0.962
> 4, 2048, 0, 0, 0.962
> 4, 2048, 0, 1, 0.962
> 4, 2050, 0, 0, 0.977
> 4, 2050, 0, 1, 0.979
> 4, 2048, 2, 0, 0.962
> 4, 2048, 2, 1, 0.962
> 4, 2050, 2, 0, 0.962
> 4, 2050, 2, 1, 0.962
> 8, 0, 0, 0, 0.961
> 8, 0, 0, 1, 0.962
> 8, 3, 0, 0, 1.0
> 8, 3, 0, 1, 1.0
> 8, 35, 0, 0, 1.0
> 8, 35, 0, 1, 1.0
> 8, 0, 3, 0, 0.962
> 8, 0, 3, 1, 0.962
> 8, 0, 35, 0, 0.962
> 8, 0, 35, 1, 0.962
> 8, 3, 3, 0, 0.962
> 8, 3, 3, 1, 0.962
> 8, 35, 35, 0, 0.962
> 8, 35, 35, 1, 0.962
> 8, 2048, 0, 0, 0.962
> 8, 2048, 0, 1, 0.962
> 8, 2051, 0, 0, 1.0
> 8, 2051, 0, 1, 1.0
> 8, 2048, 3, 0, 0.962
> 8, 2048, 3, 1, 0.962
> 8, 2051, 3, 0, 0.962
> 8, 2051, 3, 1, 0.962
> 16, 0, 0, 0, 0.798
> 16, 0, 0, 1, 0.799
> 16, 4, 0, 0, 0.8
> 16, 4, 0, 1, 0.801
> 16, 36, 0, 0, 0.801
> 16, 36, 0, 1, 0.8
> 16, 0, 4, 0, 0.798
> 16, 0, 4, 1, 0.798
> 16, 0, 36, 0, 0.798
> 16, 0, 36, 1, 0.798
> 16, 4, 4, 0, 0.798
> 16, 4, 4, 1, 0.798
> 16, 36, 36, 0, 0.798
> 16, 36, 36, 1, 0.798
> 16, 2048, 0, 0, 0.798
> 16, 2048, 0, 1, 0.799
> 16, 2052, 0, 0, 0.8
> 16, 2052, 0, 1, 0.8
> 16, 2048, 4, 0, 0.798
> 16, 2048, 4, 1, 0.798
> 16, 2052, 4, 0, 0.798
> 16, 2052, 4, 1, 0.798
> 32, 0, 0, 0, 0.471
> 32, 0, 0, 1, 0.471
> 32, 5, 0, 0, 0.471
> 32, 5, 0, 1, 0.471
> 32, 37, 0, 0, 0.961
> 32, 37, 0, 1, 0.961
> 32, 0, 5, 0, 0.471
> 32, 0, 5, 1, 0.471
> 32, 0, 37, 0, 1.021
> 32, 0, 37, 1, 1.021
> 32, 5, 5, 0, 0.471
> 32, 5, 5, 1, 0.471
> 32, 37, 37, 0, 1.011
> 32, 37, 37, 1, 1.011
> 32, 2048, 0, 0, 0.471
> 32, 2048, 0, 1, 0.471
> 32, 2053, 0, 0, 0.471
> 32, 2053, 0, 1, 0.471
> 32, 2048, 5, 0, 0.471
> 32, 2048, 5, 1, 0.471
> 32, 2053, 5, 0, 0.471
> 32, 2053, 5, 1, 0.471
> 64, 0, 0, 0, 1.0
> 64, 0, 0, 1, 1.0
> 64, 6, 0, 0, 0.862
> 64, 6, 0, 1, 0.862
> 64, 38, 0, 0, 0.912
> 64, 38, 0, 1, 0.912
> 64, 0, 6, 0, 0.896
> 64, 0, 6, 1, 0.896
> 64, 0, 38, 0, 0.906
> 64, 0, 38, 1, 0.906
> 64, 6, 6, 0, 0.91
> 64, 6, 6, 1, 0.91
> 64, 38, 38, 0, 0.883
> 64, 38, 38, 1, 0.883
> 64, 2048, 0, 0, 1.0
> 64, 2048, 0, 1, 1.0
> 64, 2054, 0, 0, 0.862
> 64, 2054, 0, 1, 0.862
> 64, 2048, 6, 0, 0.887
> 64, 2048, 6, 1, 0.887
> 64, 2054, 6, 0, 0.887
> 64, 2054, 6, 1, 0.887
> 128, 0, 0, 0, 0.857
> 128, 0, 0, 1, 0.857
> 128, 7, 0, 0, 0.875
> 128, 7, 0, 1, 0.875
> 128, 39, 0, 0, 0.892
> 128, 39, 0, 1, 0.892
> 128, 0, 7, 0, 1.183
> 128, 0, 7, 1, 1.183
> 128, 0, 39, 0, 1.113
> 128, 0, 39, 1, 1.113
> 128, 7, 7, 0, 0.692
> 128, 7, 7, 1, 0.692
> 128, 39, 39, 0, 1.104
> 128, 39, 39, 1, 1.104
> 128, 2048, 0, 0, 0.857
> 128, 2048, 0, 1, 0.857
> 128, 2055, 0, 0, 0.875
> 128, 2055, 0, 1, 0.875
> 128, 2048, 7, 0, 0.959
> 128, 2048, 7, 1, 0.959
> 128, 2055, 7, 0, 1.036
> 128, 2055, 7, 1, 1.036
> 256, 0, 0, 0, 0.889
> 256, 0, 0, 1, 0.889
> 256, 8, 0, 0, 0.966
> 256, 8, 0, 1, 0.966
> 256, 40, 0, 0, 0.983
> 256, 40, 0, 1, 0.983
> 256, 0, 8, 0, 1.29
> 256, 0, 8, 1, 1.29
> 256, 0, 40, 0, 1.274
> 256, 0, 40, 1, 1.274
> 256, 8, 8, 0, 0.865
> 256, 8, 8, 1, 0.865
> 256, 40, 40, 0, 1.477
> 256, 40, 40, 1, 1.477
> 256, 2048, 0, 0, 0.889
> 256, 2048, 0, 1, 0.889
> 256, 2056, 0, 0, 0.966
> 256, 2056, 0, 1, 0.966
> 256, 2048, 8, 0, 0.952
> 256, 2048, 8, 1, 0.952
> 256, 2056, 8, 0, 0.878
> 256, 2056, 8, 1, 0.878
> 512, 0, 0, 0, 1.077
> 512, 0, 0, 1, 1.077
> 512, 9, 0, 0, 1.001
> 512, 9, 0, 1, 1.0
> 512, 41, 0, 0, 0.954
> 512, 41, 0, 1, 0.954
> 512, 0, 9, 0, 1.191
> 512, 0, 9, 1, 1.191
> 512, 0, 41, 0, 1.181
> 512, 0, 41, 1, 1.181
> 512, 9, 9, 0, 0.765
> 512, 9, 9, 1, 0.765
> 512, 41, 41, 0, 0.905
> 512, 41, 41, 1, 0.905
> 512, 2048, 0, 0, 1.077
> 512, 2048, 0, 1, 1.077
> 512, 2057, 0, 0, 1.0
> 512, 2057, 0, 1, 1.0
> 512, 2048, 9, 0, 1.0
> 512, 2048, 9, 1, 1.0
> 512, 2057, 9, 0, 0.733
> 512, 2057, 9, 1, 0.733
> 1024, 0, 0, 0, 1.143
> 1024, 0, 0, 1, 1.143
> 1024, 10, 0, 0, 1.015
> 1024, 10, 0, 1, 1.015
> 1024, 42, 0, 0, 1.045
> 1024, 42, 0, 1, 1.045
> 1024, 0, 10, 0, 1.126
> 1024, 0, 10, 1, 1.126
> 1024, 0, 42, 0, 1.114
> 1024, 0, 42, 1, 1.114
> 1024, 10, 10, 0, 0.89
> 1024, 10, 10, 1, 0.89
> 1024, 42, 42, 0, 0.986
> 1024, 42, 42, 1, 0.986
> 1024, 2048, 0, 0, 1.143
> 1024, 2048, 0, 1, 1.143
> 1024, 2058, 0, 0, 1.015
> 1024, 2058, 0, 1, 1.015
> 1024, 2048, 10, 0, 1.03
> 1024, 2048, 10, 1, 1.03
> 1024, 2058, 10, 0, 0.854
> 1024, 2058, 10, 1, 0.854
> 2048, 0, 0, 0, 1.005
> 2048, 0, 0, 1, 1.005
> 2048, 11, 0, 0, 1.013
> 2048, 11, 0, 1, 1.014
> 2048, 43, 0, 0, 1.044
> 2048, 43, 0, 1, 1.044
> 2048, 0, 11, 0, 1.003
> 2048, 0, 11, 1, 1.003
> 2048, 0, 43, 0, 1.003
> 2048, 0, 43, 1, 1.003
> 2048, 11, 11, 0, 0.92
> 2048, 11, 11, 1, 0.92
> 2048, 43, 43, 0, 1.0
> 2048, 43, 43, 1, 1.0
> 2048, 2048, 0, 0, 1.005
> 2048, 2048, 0, 1, 1.005
> 2048, 2059, 0, 0, 0.904
> 2048, 2059, 0, 1, 0.904
> 2048, 2048, 11, 0, 1.0
> 2048, 2048, 11, 1, 1.0
> 2048, 2059, 11, 0, 0.979
> 2048, 2059, 11, 1, 0.979
> 4096, 0, 0, 0, 1.014
> 4096, 0, 0, 1, 1.014
> 4096, 12, 0, 0, 0.855
> 4096, 12, 0, 1, 0.855
> 4096, 44, 0, 0, 0.857
> 4096, 44, 0, 1, 0.857
> 4096, 0, 12, 0, 0.932
> 4096, 0, 12, 1, 0.932
> 4096, 0, 44, 0, 0.932
> 4096, 0, 44, 1, 0.932
> 4096, 12, 12, 0, 0.999
> 4096, 12, 12, 1, 0.999
> 4096, 44, 44, 0, 1.051
> 4096, 44, 44, 1, 1.051
> 4096, 2048, 0, 0, 1.014
> 4096, 2048, 0, 1, 1.014
> 4096, 2060, 0, 0, 0.98
> 4096, 2060, 0, 1, 0.98
> 4096, 2048, 12, 0, 0.77
> 4096, 2048, 12, 1, 0.77
> 4096, 2060, 12, 0, 0.943
> 4096, 2060, 12, 1, 0.943
> 8192, 0, 0, 0, 1.046
> 8192, 0, 0, 1, 1.046
> 8192, 13, 0, 0, 0.885
> 8192, 13, 0, 1, 0.885
> 8192, 45, 0, 0, 0.887
> 8192, 45, 0, 1, 0.886
> 8192, 0, 13, 0, 0.942
> 8192, 0, 13, 1, 0.942
> 8192, 0, 45, 0, 0.942
> 8192, 0, 45, 1, 0.942
> 8192, 13, 13, 0, 1.03
> 8192, 13, 13, 1, 1.03
> 8192, 45, 45, 0, 1.048
> 8192, 45, 45, 1, 1.048
> 8192, 2048, 0, 0, 1.048
> 8192, 2048, 0, 1, 1.048
> 8192, 2061, 0, 0, 1.011
> 8192, 2061, 0, 1, 1.011
> 8192, 2048, 13, 0, 0.789
> 8192, 2048, 13, 1, 0.789
> 8192, 2061, 13, 0, 0.991
> 8192, 2061, 13, 1, 0.991
> 16384, 0, 0, 0, 1.014
> 16384, 0, 0, 1, 1.008
> 16384, 14, 0, 0, 0.951
> 16384, 14, 0, 1, 0.95
> 16384, 46, 0, 0, 0.874
> 16384, 46, 0, 1, 0.871
> 16384, 0, 14, 0, 0.813
> 16384, 0, 14, 1, 0.81
> 16384, 0, 46, 0, 0.85
> 16384, 0, 46, 1, 0.86
> 16384, 14, 14, 0, 0.985
> 16384, 14, 14, 1, 0.975
> 16384, 46, 46, 0, 1.025
> 16384, 46, 46, 1, 1.027
> 16384, 2048, 0, 0, 1.058
> 16384, 2048, 0, 1, 1.058
> 16384, 2062, 0, 0, 0.849
> 16384, 2062, 0, 1, 0.848
> 16384, 2048, 14, 0, 0.907
> 16384, 2048, 14, 1, 0.907
> 16384, 2062, 14, 0, 0.988
> 16384, 2062, 14, 1, 0.995
> 32768, 0, 0, 0, 0.979
> 32768, 0, 0, 1, 0.979
> 32768, 15, 0, 0, 1.006
> 32768, 15, 0, 1, 1.006
> 32768, 47, 0, 0, 1.004
> 32768, 47, 0, 1, 1.004
> 32768, 0, 15, 0, 1.045
> 32768, 0, 15, 1, 1.045
> 32768, 0, 47, 0, 1.011
> 32768, 0, 47, 1, 1.012
> 32768, 15, 15, 0, 0.977
> 32768, 15, 15, 1, 0.977
> 32768, 47, 47, 0, 0.96
> 32768, 47, 47, 1, 0.96
> 32768, 2048, 0, 0, 0.978
> 32768, 2048, 0, 1, 0.978
> 32768, 2063, 0, 0, 1.004
> 32768, 2063, 0, 1, 1.004
> 32768, 2048, 15, 0, 1.036
> 32768, 2048, 15, 1, 1.036
> 32768, 2063, 15, 0, 0.978
> 32768, 2063, 15, 1, 0.978
> 65536, 0, 0, 0, 0.981
> 65536, 0, 0, 1, 0.981
> 65536, 16, 0, 0, 0.987
> 65536, 16, 0, 1, 0.987
> 65536, 48, 0, 0, 0.968
> 65536, 48, 0, 1, 0.968
> 65536, 0, 16, 0, 1.014
> 65536, 0, 16, 1, 1.014
> 65536, 0, 48, 0, 0.984
> 65536, 0, 48, 1, 0.984
> 65536, 16, 16, 0, 1.01
> 65536, 16, 16, 1, 1.01
> 65536, 48, 48, 0, 0.968
> 65536, 48, 48, 1, 0.968
> 65536, 2048, 0, 0, 0.982
> 65536, 2048, 0, 1, 0.982
> 65536, 2064, 0, 0, 0.987
> 65536, 2064, 0, 1, 0.987
> 65536, 2048, 16, 0, 1.012
> 65536, 2048, 16, 1, 1.012
> 65536, 2064, 16, 0, 1.007
> 65536, 2064, 16, 1, 1.007
> 0, 0, 0, 0, 2.104
> 0, 2048, 0, 0, 2.104
> 0, 4095, 0, 0, 2.109
> 0, 0, 4095, 0, 2.103
> 1, 1, 0, 0, 2.104
> 1, 0, 1, 0, 2.098
> 1, 1, 1, 0, 2.098
> 1, 2049, 0, 0, 2.102
> 1, 2048, 1, 0, 2.098
> 1, 2049, 1, 0, 2.098
> 1, 4095, 0, 0, 2.103
> 1, 0, 4095, 0, 2.098
> 2, 2, 0, 0, 1.139
> 2, 0, 2, 0, 1.136
> 2, 2, 2, 0, 1.136
> 2, 2050, 0, 0, 1.139
> 2, 2048, 2, 0, 1.136
> 2, 2050, 2, 0, 1.136
> 2, 4095, 0, 0, 1.0
> 2, 0, 4095, 0, 1.022
> 3, 0, 0, 0, 0.981
> 3, 3, 0, 0, 0.984
> 3, 0, 3, 0, 0.982
> 3, 3, 3, 0, 0.982
> 3, 2048, 0, 0, 0.982
> 3, 2051, 0, 0, 0.983
> 3, 2048, 3, 0, 0.982
> 3, 2051, 3, 0, 0.982
> 3, 4095, 0, 0, 0.285
> 3, 0, 4095, 0, 0.231
> 4, 4, 0, 0, 1.373
> 4, 0, 4, 0, 1.31
> 4, 4, 4, 0, 1.282
> 4, 2052, 0, 0, 1.264
> 4, 2048, 4, 0, 1.254
> 4, 2052, 4, 0, 1.254
> 4, 4095, 0, 0, 1.971
> 4, 0, 4095, 0, 1.994
> 5, 0, 0, 0, 1.145
> 5, 5, 0, 0, 1.155
> 5, 0, 5, 0, 1.171
> 5, 5, 5, 0, 1.171
> 5, 2048, 0, 0, 1.197
> 5, 2053, 0, 0, 1.173
> 5, 2048, 5, 0, 1.171
> 5, 2053, 5, 0, 1.171
> 5, 4095, 0, 0, 0.935
> 5, 0, 4095, 0, 1.017
> 6, 0, 0, 0, 1.145
> 6, 6, 0, 0, 1.098
> 6, 0, 6, 0, 1.096
> 6, 6, 6, 0, 1.096
> 6, 2048, 0, 0, 1.12
> 6, 2054, 0, 0, 1.122
> 6, 2048, 6, 0, 1.12
> 6, 2054, 6, 0, 1.096
> 6, 4095, 0, 0, 0.935
> 6, 0, 4095, 0, 1.018
> 7, 0, 0, 0, 1.071
> 7, 7, 0, 0, 1.074
> 7, 0, 7, 0, 1.072
> 7, 7, 7, 0, 1.072
> 7, 2048, 0, 0, 1.096
> 7, 2055, 0, 0, 1.098
> 7, 2048, 7, 0, 1.096
> 7, 2055, 7, 0, 1.096
> 7, 4095, 0, 0, 0.935
> 7, 0, 4095, 0, 1.016
> 8, 8, 0, 0, 1.167
> 8, 0, 8, 0, 1.028
> 8, 8, 8, 0, 1.028
> 8, 2056, 0, 0, 1.069
> 8, 2048, 8, 0, 1.028
> 8, 2056, 8, 0, 1.028
> 8, 4095, 0, 0, 1.029
> 8, 0, 4095, 0, 1.043
> 9, 0, 0, 0, 0.799
> 9, 9, 0, 0, 0.801
> 9, 0, 9, 0, 0.799
> 9, 9, 9, 0, 0.799
> 9, 2048, 0, 0, 0.8
> 9, 2057, 0, 0, 0.801
> 9, 2048, 9, 0, 0.8
> 9, 2057, 9, 0, 0.799
> 9, 4095, 0, 0, 0.909
> 9, 0, 4095, 0, 1.0
> 10, 0, 0, 0, 0.799
> 10, 10, 0, 0, 0.801
> 10, 0, 10, 0, 0.8
> 10, 10, 10, 0, 0.8
> 10, 2048, 0, 0, 0.8
> 10, 2058, 0, 0, 0.801
> 10, 2048, 10, 0, 0.8
> 10, 2058, 10, 0, 0.8
> 10, 4095, 0, 0, 0.909
> 10, 0, 4095, 0, 1.0
> 11, 0, 0, 0, 0.799
> 11, 11, 0, 0, 0.801
> 11, 0, 11, 0, 0.8
> 11, 11, 11, 0, 0.8
> 11, 2048, 0, 0, 0.8
> 11, 2059, 0, 0, 0.802
> 11, 2048, 11, 0, 0.8
> 11, 2059, 11, 0, 0.8
> 11, 4095, 0, 0, 0.909
> 11, 0, 4095, 0, 1.0
> 12, 0, 0, 0, 0.799
> 12, 12, 0, 0, 0.801
> 12, 0, 12, 0, 0.8
> 12, 12, 12, 0, 0.8
> 12, 2048, 0, 0, 0.8
> 12, 2060, 0, 0, 0.802
> 12, 2048, 12, 0, 0.8
> 12, 2060, 12, 0, 0.8
> 12, 4095, 0, 0, 0.909
> 12, 0, 4095, 0, 1.0
> 13, 0, 0, 0, 0.798
> 13, 13, 0, 0, 0.801
> 13, 0, 13, 0, 0.799
> 13, 13, 13, 0, 0.799
> 13, 2048, 0, 0, 0.8
> 13, 2061, 0, 0, 0.801
> 13, 2048, 13, 0, 0.8
> 13, 2061, 13, 0, 0.8
> 13, 4095, 0, 0, 0.909
> 13, 0, 4095, 0, 1.0
> 14, 0, 0, 0, 0.799
> 14, 14, 0, 0, 0.801
> 14, 0, 14, 0, 0.8
> 14, 14, 14, 0, 0.8
> 14, 2048, 0, 0, 0.8
> 14, 2062, 0, 0, 0.801
> 14, 2048, 14, 0, 0.8
> 14, 2062, 14, 0, 0.8
> 14, 4095, 0, 0, 0.909
> 14, 0, 4095, 0, 1.0
> 15, 0, 0, 0, 0.799
> 15, 15, 0, 0, 0.801
> 15, 0, 15, 0, 0.8
> 15, 15, 15, 0, 0.8
> 15, 2048, 0, 0, 0.8
> 15, 2063, 0, 0, 0.802
> 15, 2048, 15, 0, 0.8
> 15, 2063, 15, 0, 0.8
> 15, 4095, 0, 0, 0.909
> 15, 0, 4095, 0, 1.0
> 16, 16, 0, 0, 0.801
> 16, 0, 16, 0, 0.799
> 16, 16, 16, 0, 0.799
> 16, 2064, 0, 0, 0.801
> 16, 2048, 16, 0, 0.798
> 16, 2064, 16, 0, 0.798
> 16, 4095, 0, 0, 1.818
> 16, 0, 4095, 0, 1.957
> 17, 0, 0, 0, 0.798
> 17, 17, 0, 0, 0.8
> 17, 0, 17, 0, 0.799
> 17, 17, 17, 0, 0.798
> 17, 2048, 0, 0, 0.798
> 17, 2065, 0, 0, 0.8
> 17, 2048, 17, 0, 0.798
> 17, 2065, 17, 0, 0.799
> 17, 4095, 0, 0, 0.937
> 17, 0, 4095, 0, 1.021
> 18, 0, 0, 0, 0.798
> 18, 18, 0, 0, 0.801
> 18, 0, 18, 0, 0.798
> 18, 18, 18, 0, 0.798
> 18, 2048, 0, 0, 0.799
> 18, 2066, 0, 0, 0.8
> 18, 2048, 18, 0, 0.798
> 18, 2066, 18, 0, 0.798
> 18, 4095, 0, 0, 0.937
> 18, 0, 4095, 0, 1.021
> 19, 0, 0, 0, 0.798
> 19, 19, 0, 0, 0.8
> 19, 0, 19, 0, 0.798
> 19, 19, 19, 0, 0.798
> 19, 2048, 0, 0, 0.798
> 19, 2067, 0, 0, 0.8
> 19, 2048, 19, 0, 0.798
> 19, 2067, 19, 0, 0.798
> 19, 4095, 0, 0, 0.937
> 19, 0, 4095, 0, 1.021
> 20, 0, 0, 0, 0.798
> 20, 20, 0, 0, 0.8
> 20, 0, 20, 0, 0.798
> 20, 20, 20, 0, 0.798
> 20, 2048, 0, 0, 0.798
> 20, 2068, 0, 0, 0.8
> 20, 2048, 20, 0, 0.798
> 20, 2068, 20, 0, 0.798
> 20, 4095, 0, 0, 0.937
> 20, 0, 4095, 0, 1.021
> 21, 0, 0, 0, 0.798
> 21, 21, 0, 0, 0.801
> 21, 0, 21, 0, 0.798
> 21, 21, 21, 0, 0.798
> 21, 2048, 0, 0, 0.798
> 21, 2069, 0, 0, 0.801
> 21, 2048, 21, 0, 0.799
> 21, 2069, 21, 0, 0.798
> 21, 4095, 0, 0, 0.937
> 21, 0, 4095, 0, 1.021
> 22, 0, 0, 0, 0.798
> 22, 22, 0, 0, 0.801
> 22, 0, 22, 0, 0.798
> 22, 22, 22, 0, 0.798
> 22, 2048, 0, 0, 0.798
> 22, 2070, 0, 0, 0.801
> 22, 2048, 22, 0, 0.798
> 22, 2070, 22, 0, 0.798
> 22, 4095, 0, 0, 0.937
> 22, 0, 4095, 0, 1.021
> 23, 0, 0, 0, 0.798
> 23, 23, 0, 0, 0.8
> 23, 0, 23, 0, 0.798
> 23, 23, 23, 0, 0.798
> 23, 2048, 0, 0, 0.798
> 23, 2071, 0, 0, 0.8
> 23, 2048, 23, 0, 0.798
> 23, 2071, 23, 0, 0.798
> 23, 4095, 0, 0, 0.937
> 23, 0, 4095, 0, 1.021
> 24, 0, 0, 0, 0.798
> 24, 24, 0, 0, 0.8
> 24, 0, 24, 0, 0.799
> 24, 24, 24, 0, 0.798
> 24, 2048, 0, 0, 0.798
> 24, 2072, 0, 0, 0.801
> 24, 2048, 24, 0, 0.798
> 24, 2072, 24, 0, 0.798
> 24, 4095, 0, 0, 0.937
> 24, 0, 4095, 0, 1.021
> 25, 0, 0, 0, 0.5
> 25, 25, 0, 0, 0.5
> 25, 0, 25, 0, 0.5
> 25, 25, 25, 0, 0.5
> 25, 2048, 0, 0, 0.5
> 25, 2073, 0, 0, 0.501
> 25, 2048, 25, 0, 0.5
> 25, 2073, 25, 0, 0.5
> 25, 4095, 0, 0, 0.974
> 25, 0, 4095, 0, 0.98
> 26, 0, 0, 0, 0.5
> 26, 26, 0, 0, 0.501
> 26, 0, 26, 0, 0.5
> 26, 26, 26, 0, 0.501
> 26, 2048, 0, 0, 0.5
> 26, 2074, 0, 0, 0.5
> 26, 2048, 26, 0, 0.5
> 26, 2074, 26, 0, 0.5
> 26, 4095, 0, 0, 0.974
> 26, 0, 4095, 0, 1.0
> 27, 0, 0, 0, 0.5
> 27, 27, 0, 0, 0.501
> 27, 0, 27, 0, 0.5
> 27, 27, 27, 0, 0.5
> 27, 2048, 0, 0, 0.5
> 27, 2075, 0, 0, 0.5
> 27, 2048, 27, 0, 0.5
> 27, 2075, 27, 0, 0.5
> 27, 4095, 0, 0, 0.974
> 27, 0, 4095, 0, 1.0
> 28, 0, 0, 0, 0.5
> 28, 28, 0, 0, 0.501
> 28, 0, 28, 0, 0.5
> 28, 28, 28, 0, 0.5
> 28, 2048, 0, 0, 0.5
> 28, 2076, 0, 0, 0.5
> 28, 2048, 28, 0, 0.5
> 28, 2076, 28, 0, 0.5
> 28, 4095, 0, 0, 0.974
> 28, 0, 4095, 0, 1.0
> 29, 0, 0, 0, 0.471
> 29, 29, 0, 0, 0.471
> 29, 0, 29, 0, 0.471
> 29, 29, 29, 0, 0.471
> 29, 2048, 0, 0, 0.471
> 29, 2077, 0, 0, 0.471
> 29, 2048, 29, 0, 0.471
> 29, 2077, 29, 0, 0.471
> 29, 4095, 0, 0, 0.974
> 29, 0, 4095, 0, 1.0
> 30, 0, 0, 0, 0.471
> 30, 30, 0, 0, 0.471
> 30, 0, 30, 0, 0.471
> 30, 30, 30, 0, 0.471
> 30, 2048, 0, 0, 0.471
> 30, 2078, 0, 0, 0.471
> 30, 2048, 30, 0, 0.471
> 30, 2078, 30, 0, 0.471
> 30, 4095, 0, 0, 0.974
> 30, 0, 4095, 0, 1.0
> 31, 0, 0, 0, 0.471
> 31, 31, 0, 0, 0.471
> 31, 0, 31, 0, 0.471
> 31, 31, 31, 0, 0.471
> 31, 2048, 0, 0, 0.471
> 31, 2079, 0, 0, 0.471
> 31, 2048, 31, 0, 0.471
> 31, 2079, 31, 0, 0.471
> 31, 4095, 0, 0, 0.974
> 31, 0, 4095, 0, 1.0
> 48, 0, 0, 0, 1.0
> 48, 0, 0, 1, 1.0
> 48, 3, 0, 0, 1.0
> 48, 3, 0, 1, 1.0
> 48, 0, 3, 0, 1.0
> 48, 0, 3, 1, 1.0
> 48, 3, 3, 0, 1.0
> 48, 3, 3, 1, 1.0
> 48, 2048, 0, 0, 1.0
> 48, 2048, 0, 1, 1.0
> 48, 2051, 0, 0, 1.0
> 48, 2051, 0, 1, 1.0
> 48, 2048, 3, 0, 1.0
> 48, 2048, 3, 1, 1.0
> 48, 2051, 3, 0, 1.0
> 48, 2051, 3, 1, 1.0
> 80, 0, 0, 0, 0.781
> 80, 0, 0, 1, 0.782
> 80, 5, 0, 0, 0.976
> 80, 5, 0, 1, 0.976
> 80, 0, 5, 0, 1.232
> 80, 0, 5, 1, 1.232
> 80, 5, 5, 0, 1.542
> 80, 5, 5, 1, 1.543
> 80, 2048, 0, 0, 0.781
> 80, 2048, 0, 1, 0.782
> 80, 2053, 0, 0, 0.976
> 80, 2053, 0, 1, 0.976
> 80, 2048, 5, 0, 1.093
> 80, 2048, 5, 1, 1.093
> 80, 2053, 5, 0, 1.371
> 80, 2053, 5, 1, 1.371
> 96, 0, 0, 0, 0.758
> 96, 0, 0, 1, 0.758
> 96, 6, 0, 0, 0.929
> 96, 6, 0, 1, 0.929
> 96, 0, 6, 0, 1.204
> 96, 0, 6, 1, 1.204
> 96, 6, 6, 0, 1.562
> 96, 6, 6, 1, 1.562
> 96, 2048, 0, 0, 0.758
> 96, 2048, 0, 1, 0.758
> 96, 2054, 0, 0, 0.929
> 96, 2054, 0, 1, 0.929
> 96, 2048, 6, 0, 1.068
> 96, 2048, 6, 1, 1.068
> 96, 2054, 6, 0, 1.562
> 96, 2054, 6, 1, 1.562
> 112, 0, 0, 0, 0.736
> 112, 0, 0, 1, 0.736
> 112, 7, 0, 0, 0.675
> 112, 7, 0, 1, 0.675
> 112, 0, 7, 0, 0.778
> 112, 0, 7, 1, 0.778
> 112, 7, 7, 0, 0.909
> 112, 7, 7, 1, 0.909
> 112, 2048, 0, 0, 0.736
> 112, 2048, 0, 1, 0.736
> 112, 2055, 0, 0, 0.675
> 112, 2055, 0, 1, 0.675
> 112, 2048, 7, 0, 0.778
> 112, 2048, 7, 1, 0.778
> 112, 2055, 7, 0, 0.909
> 112, 2055, 7, 1, 0.909
> 144, 0, 0, 0, 0.857
> 144, 0, 0, 1, 0.857
> 144, 9, 0, 0, 0.941
> 144, 9, 0, 1, 0.943
> 144, 0, 9, 0, 1.137
> 144, 0, 9, 1, 1.137
> 144, 9, 9, 0, 1.514
> 144, 9, 9, 1, 1.514
> 144, 2048, 0, 0, 0.857
> 144, 2048, 0, 1, 0.857
> 144, 2057, 0, 0, 0.939
> 144, 2057, 0, 1, 0.945
> 144, 2048, 9, 0, 0.922
> 144, 2048, 9, 1, 0.922
> 144, 2057, 9, 0, 1.514
> 144, 2057, 9, 1, 1.514
> 160, 0, 0, 0, 0.698
> 160, 0, 0, 1, 0.698
> 160, 10, 0, 0, 0.91
> 160, 10, 0, 1, 0.91
> 160, 0, 10, 0, 1.211
> 160, 0, 10, 1, 1.212
> 160, 10, 10, 0, 1.357
> 160, 10, 10, 1, 1.357
> 160, 2048, 0, 0, 0.698
> 160, 2048, 0, 1, 0.698
> 160, 2058, 0, 0, 0.91
> 160, 2058, 0, 1, 0.91
> 160, 2048, 10, 0, 0.923
> 160, 2048, 10, 1, 0.923
> 160, 2058, 10, 0, 1.357
> 160, 2058, 10, 1, 1.357
> 176, 0, 0, 0, 0.796
> 176, 0, 0, 1, 0.796
> 176, 11, 0, 0, 0.804
> 176, 11, 0, 1, 0.804
> 176, 0, 11, 0, 0.774
> 176, 0, 11, 1, 0.774
> 176, 11, 11, 0, 0.814
> 176, 11, 11, 1, 0.814
> 176, 2048, 0, 0, 0.796
> 176, 2048, 0, 1, 0.796
> 176, 2059, 0, 0, 0.804
> 176, 2059, 0, 1, 0.804
> 176, 2048, 11, 0, 0.774
> 176, 2048, 11, 1, 0.774
> 176, 2059, 11, 0, 0.814
> 176, 2059, 11, 1, 0.814
> 192, 0, 0, 0, 0.778
> 192, 0, 0, 1, 0.778
> 192, 12, 0, 0, 0.881
> 192, 12, 0, 1, 0.881
> 192, 0, 12, 0, 1.167
> 192, 0, 12, 1, 1.167
> 192, 12, 12, 0, 0.841
> 192, 12, 12, 1, 0.841
> 192, 2048, 0, 0, 0.778
> 192, 2048, 0, 1, 0.778
> 192, 2060, 0, 0, 0.881
> 192, 2060, 0, 1, 0.881
> 192, 2048, 12, 0, 0.889
> 192, 2048, 12, 1, 0.889
> 192, 2060, 12, 0, 0.906
> 192, 2060, 12, 1, 0.906
> 208, 0, 0, 0, 0.833
> 208, 0, 0, 1, 0.833
> 208, 13, 0, 0, 0.921
> 208, 13, 0, 1, 0.921
> 208, 0, 13, 0, 0.835
> 208, 0, 13, 1, 0.833
> 208, 13, 13, 0, 1.333
> 208, 13, 13, 1, 1.333
> 208, 2048, 0, 0, 0.833
> 208, 2048, 0, 1, 0.833
> 208, 2061, 0, 0, 0.921
> 208, 2061, 0, 1, 0.921
> 208, 2048, 13, 0, 0.833
> 208, 2048, 13, 1, 0.833
> 208, 2061, 13, 0, 1.333
> 208, 2061, 13, 1, 1.333
> 224, 0, 0, 0, 0.93
> 224, 0, 0, 1, 0.93
> 224, 14, 0, 0, 1.0
> 224, 14, 0, 1, 1.0
> 224, 0, 14, 0, 1.15
> 224, 0, 14, 1, 1.15
> 224, 14, 14, 0, 1.452
> 224, 14, 14, 1, 1.452
> 224, 2048, 0, 0, 0.93
> 224, 2048, 0, 1, 0.93
> 224, 2062, 0, 0, 1.0
> 224, 2062, 0, 1, 1.0
> 224, 2048, 14, 0, 0.833
> 224, 2048, 14, 1, 0.833
> 224, 2062, 14, 0, 1.452
> 224, 2062, 14, 1, 1.452
> 240, 0, 0, 0, 0.909
> 240, 0, 0, 1, 0.909
> 240, 15, 0, 0, 0.797
> 240, 15, 0, 1, 0.797
> 240, 0, 15, 0, 0.771
> 240, 0, 15, 1, 0.771
> 240, 15, 15, 0, 0.93
> 240, 15, 15, 1, 0.93
> 240, 2048, 0, 0, 0.909
> 240, 2048, 0, 1, 0.909
> 240, 2063, 0, 0, 0.797
> 240, 2063, 0, 1, 0.797
> 240, 2048, 15, 0, 0.771
> 240, 2048, 15, 1, 0.771
> 240, 2063, 15, 0, 0.93
> 240, 2063, 15, 1, 0.93
> 272, 0, 0, 0, 0.9
> 272, 0, 0, 1, 0.9
> 272, 17, 0, 0, 1.015
> 272, 17, 0, 1, 1.015
> 272, 0, 17, 0, 0.926
> 272, 0, 17, 1, 0.927
> 272, 17, 17, 0, 0.892
> 272, 17, 17, 1, 0.892
> 272, 2048, 0, 0, 0.9
> 272, 2048, 0, 1, 0.9
> 272, 2065, 0, 0, 1.015
> 272, 2065, 0, 1, 1.015
> 272, 2048, 17, 0, 0.927
> 272, 2048, 17, 1, 0.927
> 272, 2065, 17, 0, 0.878
> 272, 2065, 17, 1, 0.878
> 288, 0, 0, 0, 0.882
> 288, 0, 0, 1, 0.882
> 288, 18, 0, 0, 0.803
> 288, 18, 0, 1, 0.803
> 288, 0, 18, 0, 0.768
> 288, 0, 18, 1, 0.768
> 288, 18, 18, 0, 0.882
> 288, 18, 18, 1, 0.882
> 288, 2048, 0, 0, 0.882
> 288, 2048, 0, 1, 0.882
> 288, 2066, 0, 0, 0.803
> 288, 2066, 0, 1, 0.803
> 288, 2048, 18, 0, 0.768
> 288, 2048, 18, 1, 0.768
> 288, 2066, 18, 0, 0.882
> 288, 2066, 18, 1, 0.882
> 304, 0, 0, 0, 0.865
> 304, 0, 0, 1, 0.865
> 304, 19, 0, 0, 0.944
> 304, 19, 0, 1, 0.944
> 304, 0, 19, 0, 0.943
> 304, 0, 19, 1, 0.943
> 304, 19, 19, 0, 0.956
> 304, 19, 19, 1, 0.956
> 304, 2048, 0, 0, 0.866
> 304, 2048, 0, 1, 0.865
> 304, 2067, 0, 0, 0.944
> 304, 2067, 0, 1, 0.944
> 304, 2048, 19, 0, 0.943
> 304, 2048, 19, 1, 0.943
> 304, 2067, 19, 0, 0.947
> 304, 2067, 19, 1, 0.947
> 320, 0, 0, 0, 0.944
> 320, 0, 0, 1, 0.944
> 320, 20, 0, 0, 0.962
> 320, 20, 0, 1, 0.962
> 320, 0, 20, 0, 1.214
> 320, 0, 20, 1, 1.214
> 320, 20, 20, 0, 1.365
> 320, 20, 20, 1, 1.365
> 320, 2048, 0, 0, 0.943
> 320, 2048, 0, 1, 0.943
> 320, 2068, 0, 0, 0.962
> 320, 2068, 0, 1, 0.962
> 320, 2048, 20, 0, 0.914
> 320, 2048, 20, 1, 0.914
> 320, 2068, 20, 0, 1.365
> 320, 2068, 20, 1, 1.365
> 336, 0, 0, 0, 1.0
> 336, 0, 0, 1, 1.0
> 336, 21, 0, 0, 0.986
> 336, 21, 0, 1, 0.986
> 336, 0, 21, 0, 0.853
> 336, 0, 21, 1, 0.853
> 336, 21, 21, 0, 0.843
> 336, 21, 21, 1, 0.843
> 336, 2048, 0, 0, 1.0
> 336, 2048, 0, 1, 1.0
> 336, 2069, 0, 0, 0.986
> 336, 2069, 0, 1, 0.986
> 336, 2048, 21, 0, 0.853
> 336, 2048, 21, 1, 0.853
> 336, 2069, 21, 0, 0.831
> 336, 2069, 21, 1, 0.831
> 352, 0, 0, 0, 0.98
> 352, 0, 0, 1, 0.98
> 352, 22, 0, 0, 0.811
> 352, 22, 0, 1, 0.811
> 352, 0, 22, 0, 0.882
> 352, 0, 22, 1, 0.882
> 352, 22, 22, 0, 1.1
> 352, 22, 22, 1, 1.1
> 352, 2048, 0, 0, 0.98
> 352, 2048, 0, 1, 0.98
> 352, 2070, 0, 0, 0.811
> 352, 2070, 0, 1, 0.811
> 352, 2048, 22, 0, 0.882
> 352, 2048, 22, 1, 0.882
> 352, 2070, 22, 0, 1.1
> 352, 2070, 22, 1, 1.1
> 368, 0, 0, 0, 1.058
> 368, 0, 0, 1, 1.058
> 368, 23, 0, 0, 1.0
> 368, 23, 0, 1, 1.0
> 368, 0, 23, 0, 0.948
> 368, 0, 23, 1, 0.948
> 368, 23, 23, 0, 0.723
> 368, 23, 23, 1, 0.723
> 368, 2048, 0, 0, 1.058
> 368, 2048, 0, 1, 1.058
> 368, 2071, 0, 0, 1.0
> 368, 2071, 0, 1, 1.0
> 368, 2048, 23, 0, 0.948
> 368, 2048, 23, 1, 0.948
> 368, 2071, 23, 0, 0.701
> 368, 2071, 23, 1, 0.701
> 384, 0, 0, 0, 1.012
> 384, 0, 0, 1, 1.012
> 384, 24, 0, 0, 1.04
> 384, 24, 0, 1, 1.04
> 384, 0, 24, 0, 1.154
> 384, 0, 24, 1, 1.154
> 384, 24, 24, 0, 1.423
> 384, 24, 24, 1, 1.423
> 384, 2048, 0, 0, 1.012
> 384, 2048, 0, 1, 1.012
> 384, 2072, 0, 0, 1.04
> 384, 2072, 0, 1, 1.04
> 384, 2048, 24, 0, 0.91
> 384, 2048, 24, 1, 0.91
> 384, 2072, 24, 0, 1.423
> 384, 2072, 24, 1, 1.423
> 400, 0, 0, 0, 0.948
> 400, 0, 0, 1, 0.948
> 400, 25, 0, 0, 0.957
> 400, 25, 0, 1, 0.957
> 400, 0, 25, 0, 1.099
> 400, 0, 25, 1, 1.069
> 400, 25, 25, 0, 0.885
> 400, 25, 25, 1, 0.885
> 400, 2048, 0, 0, 0.948
> 400, 2048, 0, 1, 0.948
> 400, 2073, 0, 0, 0.957
> 400, 2073, 0, 1, 0.957
> 400, 2048, 25, 0, 0.94
> 400, 2048, 25, 1, 0.94
> 400, 2073, 25, 0, 0.908
> 400, 2073, 25, 1, 0.908
> 416, 0, 0, 0, 1.017
> 416, 0, 0, 1, 1.017
> 416, 26, 0, 0, 0.903
> 416, 26, 0, 1, 0.903
> 416, 0, 26, 0, 0.881
> 416, 0, 26, 1, 0.881
> 416, 26, 26, 0, 1.035
> 416, 26, 26, 1, 1.035
> 416, 2048, 0, 0, 1.017
> 416, 2048, 0, 1, 1.017
> 416, 2074, 0, 0, 0.903
> 416, 2074, 0, 1, 0.903
> 416, 2048, 26, 0, 0.881
> 416, 2048, 26, 1, 0.881
> 416, 2074, 26, 0, 1.034
> 416, 2074, 26, 1, 1.035
> 432, 0, 0, 0, 1.0
> 432, 0, 0, 1, 1.0
> 432, 27, 0, 0, 0.933
> 432, 27, 0, 1, 0.933
> 432, 0, 27, 0, 0.941
> 432, 0, 27, 1, 0.941
> 432, 27, 27, 0, 0.953
> 432, 27, 27, 1, 0.954
> 432, 2048, 0, 0, 1.0
> 432, 2048, 0, 1, 1.0
> 432, 2075, 0, 0, 0.933
> 432, 2075, 0, 1, 0.933
> 432, 2048, 27, 0, 0.941
> 432, 2048, 27, 1, 0.941
> 432, 2075, 27, 0, 0.93
> 432, 2075, 27, 1, 0.93
> 448, 0, 0, 0, 0.984
> 448, 0, 0, 1, 0.984
> 448, 28, 0, 0, 0.896
> 448, 28, 0, 1, 0.896
> 448, 0, 28, 0, 1.244
> 448, 0, 28, 1, 1.244
> 448, 28, 28, 0, 1.333
> 448, 28, 28, 1, 1.333
> 448, 2048, 0, 0, 0.984
> 448, 2048, 0, 1, 0.984
> 448, 2076, 0, 0, 0.896
> 448, 2076, 0, 1, 0.896
> 448, 2048, 28, 0, 0.988
> 448, 2048, 28, 1, 0.988
> 448, 2076, 28, 0, 1.333
> 448, 2076, 28, 1, 1.333
> 464, 0, 0, 0, 1.083
> 464, 0, 0, 1, 1.083
> 464, 29, 0, 0, 0.978
> 464, 29, 0, 1, 0.978
> 464, 0, 29, 0, 0.924
> 464, 0, 29, 1, 0.924
> 464, 29, 29, 0, 0.901
> 464, 29, 29, 1, 0.901
> 464, 2048, 0, 0, 1.083
> 464, 2048, 0, 1, 1.083
> 464, 2077, 0, 0, 0.978
> 464, 2077, 0, 1, 0.978
> 464, 2048, 29, 0, 0.924
> 464, 2048, 29, 1, 0.924
> 464, 2077, 29, 0, 0.89
> 464, 2077, 29, 1, 0.89
> 480, 0, 0, 0, 1.066
> 480, 0, 0, 1, 1.066
> 480, 30, 0, 0, 0.9
> 480, 30, 0, 1, 0.9
> 480, 0, 30, 0, 0.88
> 480, 0, 30, 1, 0.88
> 480, 30, 30, 0, 1.083
> 480, 30, 30, 1, 1.083
> 480, 2048, 0, 0, 1.066
> 480, 2048, 0, 1, 1.066
> 480, 2078, 0, 0, 0.9
> 480, 2078, 0, 1, 0.9
> 480, 2048, 30, 0, 0.88
> 480, 2048, 30, 1, 0.88
> 480, 2078, 30, 0, 1.083
> 480, 2078, 30, 1, 1.083
> 496, 0, 0, 0, 1.032
> 496, 0, 0, 1, 1.032
> 496, 31, 0, 0, 0.95
> 496, 31, 0, 1, 0.95
> 496, 0, 31, 0, 1.011
> 496, 0, 31, 1, 1.011
> 496, 31, 31, 0, 0.973
> 496, 31, 31, 1, 0.973
> 496, 2048, 0, 0, 1.032
> 496, 2048, 0, 1, 1.032
> 496, 2079, 0, 0, 0.95
> 496, 2079, 0, 1, 0.95
> 496, 2048, 31, 0, 1.011
> 496, 2048, 31, 1, 1.011
> 496, 2079, 31, 0, 0.941
> 496, 2079, 31, 1, 0.941
> 1024, 32, 0, 0, 1.143
> 1024, 32, 0, 1, 1.143
> 1024, 0, 32, 0, 1.143
> 1024, 0, 32, 1, 1.143
> 1024, 32, 32, 0, 1.143
> 1024, 32, 32, 1, 1.143
> 1024, 2080, 0, 0, 1.143
> 1024, 2080, 0, 1, 1.143
> 1024, 2048, 32, 0, 1.143
> 1024, 2048, 32, 1, 1.143
> 1024, 2080, 32, 0, 1.143
> 1024, 2080, 32, 1, 1.143
> 1056, 0, 0, 0, 1.168
> 1056, 0, 0, 1, 1.168
> 1056, 33, 0, 0, 1.067
> 1056, 33, 0, 1, 1.067
> 1056, 0, 33, 0, 0.977
> 1056, 0, 33, 1, 0.977
> 1056, 33, 33, 0, 1.043
> 1056, 33, 33, 1, 1.043
> 1056, 2048, 0, 0, 1.168
> 1056, 2048, 0, 1, 1.168
> 1056, 2081, 0, 0, 1.067
> 1056, 2081, 0, 1, 1.067
> 1056, 2048, 33, 0, 0.977
> 1056, 2048, 33, 1, 0.977
> 1056, 2081, 33, 0, 1.0
> 1056, 2081, 33, 1, 1.0
> 1088, 0, 0, 0, 1.171
> 1088, 0, 0, 1, 1.171
> 1088, 34, 0, 0, 1.041
> 1088, 34, 0, 1, 1.041
> 1088, 0, 34, 0, 1.079
> 1088, 0, 34, 1, 1.079
> 1088, 34, 34, 0, 0.966
> 1088, 34, 34, 1, 0.966
> 1088, 2048, 0, 0, 1.171
> 1088, 2048, 0, 1, 1.171
> 1088, 2082, 0, 0, 1.041
> 1088, 2082, 0, 1, 1.041
> 1088, 2048, 34, 0, 0.994
> 1088, 2048, 34, 1, 0.994
> 1088, 2082, 34, 0, 0.966
> 1088, 2082, 34, 1, 0.966
> 1120, 0, 0, 0, 1.152
> 1120, 0, 0, 1, 1.153
> 1120, 35, 0, 0, 1.051
> 1120, 35, 0, 1, 1.051
> 1120, 0, 35, 0, 1.0
> 1120, 0, 35, 1, 1.0
> 1120, 35, 35, 0, 1.068
> 1120, 35, 35, 1, 1.068
> 1120, 2048, 0, 0, 1.151
> 1120, 2048, 0, 1, 1.151
> 1120, 2083, 0, 0, 1.051
> 1120, 2083, 0, 1, 1.051
> 1120, 2048, 35, 0, 1.0
> 1120, 2048, 35, 1, 1.0
> 1120, 2083, 35, 0, 1.027
> 1120, 2083, 35, 1, 1.027
> 1152, 0, 0, 0, 1.159
> 1152, 0, 0, 1, 1.159
> 1152, 36, 0, 0, 1.034
> 1152, 36, 0, 1, 1.034
> 1152, 0, 36, 0, 1.07
> 1152, 0, 36, 1, 1.07
> 1152, 36, 36, 0, 0.967
> 1152, 36, 36, 1, 0.967
> 1152, 2048, 0, 0, 1.159
> 1152, 2048, 0, 1, 1.159
> 1152, 2084, 0, 0, 1.034
> 1152, 2084, 0, 1, 1.034
> 1152, 2048, 36, 0, 0.984
> 1152, 2048, 36, 1, 0.984
> 1152, 2084, 36, 0, 0.967
> 1152, 2084, 36, 1, 0.967
> 1184, 0, 0, 0, 1.157
> 1184, 0, 0, 1, 1.157
> 1184, 37, 0, 0, 1.067
> 1184, 37, 0, 1, 1.066
> 1184, 0, 37, 0, 0.993
> 1184, 0, 37, 1, 0.993
> 1184, 37, 37, 0, 1.08
> 1184, 37, 37, 1, 1.081
> 1184, 2048, 0, 0, 1.157
> 1184, 2048, 0, 1, 1.157
> 1184, 2085, 0, 0, 1.066
> 1184, 2085, 0, 1, 1.066
> 1184, 2048, 37, 0, 0.993
> 1184, 2048, 37, 1, 0.993
> 1184, 2085, 37, 0, 1.04
> 1184, 2085, 37, 1, 1.04
> 1216, 0, 0, 0, 1.139
> 1216, 0, 0, 1, 1.139
> 1216, 38, 0, 0, 1.024
> 1216, 38, 0, 1, 1.024
> 1216, 0, 38, 0, 1.087
> 1216, 0, 38, 1, 1.087
> 1216, 38, 38, 0, 1.0
> 1216, 38, 38, 1, 1.0
> 1216, 2048, 0, 0, 1.138
> 1216, 2048, 0, 1, 1.138
> 1216, 2086, 0, 0, 1.024
> 1216, 2086, 0, 1, 1.024
> 1216, 2048, 38, 0, 1.01
> 1216, 2048, 38, 1, 1.01
> 1216, 2086, 38, 0, 1.0
> 1216, 2086, 38, 1, 1.0
> 1248, 0, 0, 0, 1.176
> 1248, 0, 0, 1, 1.174
> 1248, 39, 0, 0, 1.074
> 1248, 39, 0, 1, 1.074
> 1248, 0, 39, 0, 0.966
> 1248, 0, 39, 1, 0.985
> 1248, 39, 39, 0, 1.064
> 1248, 39, 39, 1, 1.064
> 1248, 2048, 0, 0, 1.179
> 1248, 2048, 0, 1, 1.179
> 1248, 2087, 0, 0, 1.074
> 1248, 2087, 0, 1, 1.074
> 1248, 2048, 39, 0, 0.985
> 1248, 2048, 39, 1, 0.985
> 1248, 2087, 39, 0, 1.026
> 1248, 2087, 39, 1, 1.026
> 1280, 0, 0, 0, 0.993
> 1280, 0, 0, 1, 0.993
> 1280, 40, 0, 0, 1.051
> 1280, 40, 0, 1, 1.051
> 1280, 0, 40, 0, 1.044
> 1280, 0, 40, 1, 1.045
> 1280, 40, 40, 0, 1.25
> 1280, 40, 40, 1, 1.25
> 1280, 2048, 0, 0, 0.992
> 1280, 2048, 0, 1, 0.992
> 1280, 2088, 0, 0, 1.051
> 1280, 2088, 0, 1, 1.051
> 1280, 2048, 40, 0, 0.946
> 1280, 2048, 40, 1, 0.946
> 1280, 2088, 40, 0, 1.252
> 1280, 2088, 40, 1, 1.252
> 1312, 0, 0, 0, 0.969
> 1312, 0, 0, 1, 0.969
> 1312, 41, 0, 0, 0.991
> 1312, 41, 0, 1, 0.991
> 1312, 0, 41, 0, 0.837
> 1312, 0, 41, 1, 0.837
> 1312, 41, 41, 0, 1.025
> 1312, 41, 41, 1, 1.025
> 1312, 2048, 0, 0, 0.969
> 1312, 2048, 0, 1, 0.969
> 1312, 2089, 0, 0, 0.991
> 1312, 2089, 0, 1, 0.99
> 1312, 2048, 41, 0, 0.837
> 1312, 2048, 41, 1, 0.837
> 1312, 2089, 41, 0, 0.975
> 1312, 2089, 41, 1, 0.975
> 1344, 0, 0, 0, 0.988
> 1344, 0, 0, 1, 0.988
> 1344, 42, 0, 0, 1.031
> 1344, 42, 0, 1, 1.031
> 1344, 0, 42, 0, 1.033
> 1344, 0, 42, 1, 1.033
> 1344, 42, 42, 0, 0.982
> 1344, 42, 42, 1, 0.982
> 1344, 2048, 0, 0, 0.992
> 1344, 2048, 0, 1, 0.992
> 1344, 2090, 0, 0, 1.031
> 1344, 2090, 0, 1, 1.031
> 1344, 2048, 42, 0, 0.943
> 1344, 2048, 42, 1, 0.942
> 1344, 2090, 42, 0, 0.982
> 1344, 2090, 42, 1, 0.982
> 1376, 0, 0, 0, 1.016
> 1376, 0, 0, 1, 1.016
> 1376, 43, 0, 0, 1.01
> 1376, 43, 0, 1, 1.01
> 1376, 0, 43, 0, 0.829
> 1376, 0, 43, 1, 0.829
> 1376, 43, 43, 0, 1.024
> 1376, 43, 43, 1, 1.024
> 1376, 2048, 0, 0, 1.006
> 1376, 2048, 0, 1, 1.015
> 1376, 2091, 0, 0, 1.01
> 1376, 2091, 0, 1, 1.01
> 1376, 2048, 43, 0, 0.829
> 1376, 2048, 43, 1, 0.829
> 1376, 2091, 43, 0, 0.98
> 1376, 2091, 43, 1, 0.98
> 1408, 0, 0, 0, 0.987
> 1408, 0, 0, 1, 0.987
> 1408, 44, 0, 0, 1.015
> 1408, 44, 0, 1, 1.015
> 1408, 0, 44, 0, 1.018
> 1408, 0, 44, 1, 1.014
> 1408, 44, 44, 0, 1.004
> 1408, 44, 44, 1, 0.994
> 1408, 2048, 0, 0, 0.988
> 1408, 2048, 0, 1, 0.988
> 1408, 2092, 0, 0, 1.015
> 1408, 2092, 0, 1, 1.015
> 1408, 2048, 44, 0, 0.955
> 1408, 2048, 44, 1, 0.955
> 1408, 2092, 44, 0, 1.0
> 1408, 2092, 44, 1, 0.994
> 1440, 0, 0, 0, 0.986
> 1440, 0, 0, 1, 0.986
> 1440, 45, 0, 0, 1.013
> 1440, 45, 0, 1, 1.013
> 1440, 0, 45, 0, 0.814
> 1440, 0, 45, 1, 0.814
> 1440, 45, 45, 0, 1.006
> 1440, 45, 45, 1, 1.006
> 1440, 2048, 0, 0, 0.986
> 1440, 2048, 0, 1, 0.986
> 1440, 2093, 0, 0, 1.013
> 1440, 2093, 0, 1, 1.013
> 1440, 2048, 45, 0, 0.814
> 1440, 2048, 45, 1, 0.814
> 1440, 2093, 45, 0, 0.966
> 1440, 2093, 45, 1, 0.966
> 1472, 0, 0, 0, 0.997
> 1472, 0, 0, 1, 0.994
> 1472, 46, 0, 0, 1.045
> 1472, 46, 0, 1, 1.045
> 1472, 0, 46, 0, 1.026
> 1472, 0, 46, 1, 1.026
> 1472, 46, 46, 0, 0.966
> 1472, 46, 46, 1, 0.966
> 1472, 2048, 0, 0, 1.0
> 1472, 2048, 0, 1, 0.996
> 1472, 2094, 0, 0, 1.045
> 1472, 2094, 0, 1, 1.045
> 1472, 2048, 46, 0, 0.939
> 1472, 2048, 46, 1, 0.939
> 1472, 2094, 46, 0, 0.966
> 1472, 2094, 46, 1, 0.966
> 1504, 0, 0, 0, 0.993
> 1504, 0, 0, 1, 0.993
> 1504, 47, 0, 0, 0.999
> 1504, 47, 0, 1, 0.999
> 1504, 0, 47, 0, 0.826
> 1504, 0, 47, 1, 0.826
> 1504, 47, 47, 0, 1.023
> 1504, 47, 47, 1, 1.023
> 1504, 2048, 0, 0, 0.993
> 1504, 2048, 0, 1, 0.993
> 1504, 2095, 0, 0, 0.999
> 1504, 2095, 0, 1, 0.999
> 1504, 2048, 47, 0, 0.826
> 1504, 2048, 47, 1, 0.826
> 1504, 2095, 47, 0, 0.993
> 1504, 2095, 47, 1, 0.993
> 1536, 0, 0, 0, 0.992
> 1536, 0, 0, 1, 0.991
> 1536, 48, 0, 0, 1.019
> 1536, 48, 0, 1, 1.019
> 1536, 0, 48, 0, 1.025
> 1536, 0, 48, 1, 1.024
> 1536, 48, 48, 0, 0.994
> 1536, 48, 48, 1, 0.994
> 1536, 2048, 0, 0, 0.994
> 1536, 2048, 0, 1, 0.994
> 1536, 2096, 0, 0, 1.019
> 1536, 2096, 0, 1, 1.019
> 1536, 2048, 48, 0, 1.025
> 1536, 2048, 48, 1, 1.025
> 1536, 2096, 48, 0, 0.994
> 1536, 2096, 48, 1, 0.994
> 1568, 0, 0, 0, 0.994
> 1568, 0, 0, 1, 0.994
> 1568, 49, 0, 0, 0.903
> 1568, 49, 0, 1, 0.903
> 1568, 0, 49, 0, 1.144
> 1568, 0, 49, 1, 1.144
> 1568, 49, 49, 0, 1.461
> 1568, 49, 49, 1, 1.461
> 1568, 2048, 0, 0, 0.993
> 1568, 2048, 0, 1, 0.993
> 1568, 2097, 0, 0, 0.903
> 1568, 2097, 0, 1, 0.903
> 1568, 2048, 49, 0, 1.09
> 1568, 2048, 49, 1, 1.09
> 1568, 2097, 49, 0, 1.46
> 1568, 2097, 49, 1, 1.46
> 1600, 0, 0, 0, 0.981
> 1600, 0, 0, 1, 0.981
> 1600, 50, 0, 0, 1.022
> 1600, 50, 0, 1, 1.022
> 1600, 0, 50, 0, 1.017
> 1600, 0, 50, 1, 1.017
> 1600, 50, 50, 0, 0.973
> 1600, 50, 50, 1, 0.973
> 1600, 2048, 0, 0, 0.981
> 1600, 2048, 0, 1, 0.981
> 1600, 2098, 0, 0, 1.022
> 1600, 2098, 0, 1, 1.022
> 1600, 2048, 50, 0, 0.961
> 1600, 2048, 50, 1, 0.961
> 1600, 2098, 50, 0, 0.973
> 1600, 2098, 50, 1, 0.973
> 1632, 0, 0, 0, 1.019
> 1632, 0, 0, 1, 1.019
> 1632, 51, 0, 0, 0.893
> 1632, 51, 0, 1, 0.893
> 1632, 0, 51, 0, 1.131
> 1632, 0, 51, 1, 1.131
> 1632, 51, 51, 0, 1.444
> 1632, 51, 51, 1, 1.444
> 1632, 2048, 0, 0, 1.019
> 1632, 2048, 0, 1, 1.019
> 1632, 2099, 0, 0, 0.893
> 1632, 2099, 0, 1, 0.893
> 1632, 2048, 51, 0, 1.079
> 1632, 2048, 51, 1, 1.079
> 1632, 2099, 51, 0, 1.449
> 1632, 2099, 51, 1, 1.449
> 1664, 0, 0, 0, 1.005
> 1664, 0, 0, 1, 1.004
> 1664, 52, 0, 0, 0.986
> 1664, 52, 0, 1, 0.986
> 1664, 0, 52, 0, 1.004
> 1664, 0, 52, 1, 1.004
> 1664, 52, 52, 0, 0.976
> 1664, 52, 52, 1, 0.976
> 1664, 2048, 0, 0, 1.006
> 1664, 2048, 0, 1, 1.006
> 1664, 2100, 0, 0, 0.993
> 1664, 2100, 0, 1, 0.993
> 1664, 2048, 52, 0, 0.946
> 1664, 2048, 52, 1, 0.946
> 1664, 2100, 52, 0, 0.976
> 1664, 2100, 52, 1, 0.976
> 1696, 0, 0, 0, 0.994
> 1696, 0, 0, 1, 0.992
> 1696, 53, 0, 0, 0.884
> 1696, 53, 0, 1, 0.884
> 1696, 0, 53, 0, 1.141
> 1696, 0, 53, 1, 1.141
> 1696, 53, 53, 0, 1.43
> 1696, 53, 53, 1, 1.43
> 1696, 2048, 0, 0, 0.994
> 1696, 2048, 0, 1, 0.994
> 1696, 2101, 0, 0, 0.884
> 1696, 2101, 0, 1, 0.884
> 1696, 2048, 53, 0, 1.088
> 1696, 2048, 53, 1, 1.088
> 1696, 2101, 53, 0, 1.429
> 1696, 2101, 53, 1, 1.429
> 1728, 0, 0, 0, 0.978
> 1728, 0, 0, 1, 0.978
> 1728, 54, 0, 0, 1.031
> 1728, 54, 0, 1, 1.033
> 1728, 0, 54, 0, 1.0
> 1728, 0, 54, 1, 1.0
> 1728, 54, 54, 0, 0.96
> 1728, 54, 54, 1, 0.96
> 1728, 2048, 0, 0, 0.976
> 1728, 2048, 0, 1, 0.976
> 1728, 2102, 0, 0, 1.033
> 1728, 2102, 0, 1, 1.033
> 1728, 2048, 54, 0, 0.947
> 1728, 2048, 54, 1, 0.947
> 1728, 2102, 54, 0, 0.96
> 1728, 2102, 54, 1, 0.96
> 1760, 0, 0, 0, 1.019
> 1760, 0, 0, 1, 1.021
> 1760, 55, 0, 0, 0.9
> 1760, 55, 0, 1, 0.9
> 1760, 0, 55, 0, 1.125
> 1760, 0, 55, 1, 1.125
> 1760, 55, 55, 0, 1.437
> 1760, 55, 55, 1, 1.436
> 1760, 2048, 0, 0, 1.016
> 1760, 2048, 0, 1, 1.015
> 1760, 2103, 0, 0, 0.9
> 1760, 2103, 0, 1, 0.9
> 1760, 2048, 55, 0, 1.073
> 1760, 2048, 55, 1, 1.074
> 1760, 2103, 55, 0, 1.44
> 1760, 2103, 55, 1, 1.44
> 1792, 0, 0, 0, 1.002
> 1792, 0, 0, 1, 1.002
> 1792, 56, 0, 0, 1.028
> 1792, 56, 0, 1, 1.028
> 1792, 0, 56, 0, 1.014
> 1792, 0, 56, 1, 1.015
> 1792, 56, 56, 0, 1.191
> 1792, 56, 56, 1, 1.191
> 1792, 2048, 0, 0, 1.003
> 1792, 2048, 0, 1, 1.003
> 1792, 2104, 0, 0, 1.028
> 1792, 2104, 0, 1, 1.028
> 1792, 2048, 56, 0, 0.963
> 1792, 2048, 56, 1, 0.963
> 1792, 2104, 56, 0, 1.191
> 1792, 2104, 56, 1, 1.191
> 1824, 0, 0, 0, 0.999
> 1824, 0, 0, 1, 1.0
> 1824, 57, 0, 0, 0.891
> 1824, 57, 0, 1, 0.891
> 1824, 0, 57, 0, 1.114
> 1824, 0, 57, 1, 1.114
> 1824, 57, 57, 0, 1.407
> 1824, 57, 57, 1, 1.407
> 1824, 2048, 0, 0, 1.001
> 1824, 2048, 0, 1, 1.001
> 1824, 2105, 0, 0, 0.891
> 1824, 2105, 0, 1, 0.891
> 1824, 2048, 57, 0, 1.064
> 1824, 2048, 57, 1, 1.064
> 1824, 2105, 57, 0, 1.407
> 1824, 2105, 57, 1, 1.407
> 1856, 0, 0, 0, 0.989
> 1856, 0, 0, 1, 0.987
> 1856, 58, 0, 0, 1.042
> 1856, 58, 0, 1, 1.042
> 1856, 0, 58, 0, 1.007
> 1856, 0, 58, 1, 1.007
> 1856, 58, 58, 0, 0.978
> 1856, 58, 58, 1, 0.972
> 1856, 2048, 0, 0, 0.992
> 1856, 2048, 0, 1, 0.992
> 1856, 2106, 0, 0, 1.042
> 1856, 2106, 0, 1, 1.042
> 1856, 2048, 58, 0, 0.954
> 1856, 2048, 58, 1, 0.954
> 1856, 2106, 58, 0, 0.979
> 1856, 2106, 58, 1, 0.972
> 1888, 0, 0, 0, 0.994
> 1888, 0, 0, 1, 0.994
> 1888, 59, 0, 0, 0.883
> 1888, 59, 0, 1, 0.883
> 1888, 0, 59, 0, 1.121
> 1888, 0, 59, 1, 1.123
> 1888, 59, 59, 0, 1.413
> 1888, 59, 59, 1, 1.413
> 1888, 2048, 0, 0, 0.985
> 1888, 2048, 0, 1, 0.994
> 1888, 2107, 0, 0, 0.883
> 1888, 2107, 0, 1, 0.883
> 1888, 2048, 59, 0, 1.076
> 1888, 2048, 59, 1, 1.076
> 1888, 2107, 59, 0, 1.413
> 1888, 2107, 59, 1, 1.413
> 1920, 0, 0, 0, 1.0
> 1920, 0, 0, 1, 0.999
> 1920, 60, 0, 0, 1.033
> 1920, 60, 0, 1, 1.033
> 1920, 0, 60, 0, 0.996
> 1920, 0, 60, 1, 0.997
> 1920, 60, 60, 0, 0.968
> 1920, 60, 60, 1, 0.968
> 1920, 2048, 0, 0, 1.0
> 1920, 2048, 0, 1, 1.0
> 1920, 2108, 0, 0, 1.034
> 1920, 2108, 0, 1, 1.034
> 1920, 2048, 60, 0, 0.949
> 1920, 2048, 60, 1, 0.949
> 1920, 2108, 60, 0, 0.968
> 1920, 2108, 60, 1, 0.968
> 1952, 0, 0, 0, 1.004
> 1952, 0, 0, 1, 1.004
> 1952, 61, 0, 0, 0.898
> 1952, 61, 0, 1, 0.898
> 1952, 0, 61, 0, 1.118
> 1952, 0, 61, 1, 1.118
> 1952, 61, 61, 0, 1.387
> 1952, 61, 61, 1, 1.387
> 1952, 2048, 0, 0, 1.004
> 1952, 2048, 0, 1, 1.004
> 1952, 2109, 0, 0, 0.898
> 1952, 2109, 0, 1, 0.898
> 1952, 2048, 61, 0, 1.071
> 1952, 2048, 61, 1, 1.071
> 1952, 2109, 61, 0, 1.387
> 1952, 2109, 61, 1, 1.387
> 1984, 0, 0, 0, 0.993
> 1984, 0, 0, 1, 0.993
> 1984, 62, 0, 0, 1.025
> 1984, 62, 0, 1, 1.025
> 1984, 0, 62, 0, 1.005
> 1984, 0, 62, 1, 1.007
> 1984, 62, 62, 0, 0.982
> 1984, 62, 62, 1, 0.982
> 1984, 2048, 0, 0, 0.993
> 1984, 2048, 0, 1, 0.993
> 1984, 2110, 0, 0, 1.025
> 1984, 2110, 0, 1, 1.025
> 1984, 2048, 62, 0, 0.96
> 1984, 2048, 62, 1, 0.96
> 1984, 2110, 62, 0, 0.982
> 1984, 2110, 62, 1, 0.982
> 2016, 0, 0, 0, 1.0
> 2016, 0, 0, 1, 0.999
> 2016, 63, 0, 0, 0.889
> 2016, 63, 0, 1, 0.89
> 2016, 0, 63, 0, 1.091
> 2016, 0, 63, 1, 1.092
> 2016, 63, 63, 0, 1.362
> 2016, 63, 63, 1, 1.363
> 2016, 2048, 0, 0, 1.0
> 2016, 2048, 0, 1, 1.0
> 2016, 2111, 0, 0, 0.965
> 2016, 2111, 0, 1, 0.965
> 2016, 2048, 63, 0, 1.049
> 2016, 2048, 63, 1, 1.049
> 2016, 2111, 63, 0, 1.405
> 2016, 2111, 63, 1, 1.405
> 2048, 32, 0, 0, 1.01
> 2048, 32, 0, 1, 1.01
> 2048, 0, 32, 0, 1.005
> 2048, 0, 32, 1, 1.005
> 2048, 32, 32, 0, 1.005
> 2048, 32, 32, 1, 1.005
> 2048, 0, 1, 0, 0.983
> 2048, 0, 1, 1, 0.984
> 2048, 1, 0, 0, 1.039
> 2048, 1, 0, 1, 1.039
> 2048, 32, 1, 0, 1.063
> 2048, 32, 1, 1, 1.063
> 2048, 1, 32, 0, 0.94
> 2048, 1, 32, 1, 0.94
> 2048, 2048, 1, 0, 0.981
> 2048, 2048, 1, 1, 0.981
> 2048, 2049, 0, 0, 0.904
> 2048, 2049, 0, 1, 0.904
> 2112, 0, 0, 0, 0.996
> 2112, 0, 0, 1, 0.995
> 2112, 1, 0, 0, 1.031
> 2112, 1, 0, 1, 1.031
> 2112, 33, 0, 0, 1.01
> 2112, 33, 0, 1, 1.01
> 2112, 0, 1, 0, 0.972
> 2112, 0, 1, 1, 0.972
> 2112, 0, 33, 0, 0.987
> 2112, 0, 33, 1, 0.987
> 2112, 1, 1, 0, 0.914
> 2112, 1, 1, 1, 0.914
> 2112, 33, 33, 0, 0.983
> 2112, 33, 33, 1, 0.983
> 2112, 2048, 0, 0, 0.994
> 2112, 2048, 0, 1, 0.99
> 2112, 2049, 0, 0, 1.031
> 2112, 2049, 0, 1, 1.031
> 2112, 2048, 1, 0, 0.955
> 2112, 2048, 1, 1, 0.955
> 2112, 2049, 1, 0, 0.906
> 2112, 2049, 1, 1, 0.906
> 2112, 33, 1, 0, 1.163
> 2112, 33, 1, 1, 1.164
> 2112, 1, 33, 0, 1.046
> 2112, 1, 33, 1, 1.046
> 2176, 0, 0, 0, 0.984
> 2176, 0, 0, 1, 0.985
> 2176, 2, 0, 0, 1.023
> 2176, 2, 0, 1, 1.023
> 2176, 34, 0, 0, 1.0
> 2176, 34, 0, 1, 1.0
> 2176, 0, 2, 0, 0.985
> 2176, 0, 2, 1, 0.985
> 2176, 0, 34, 0, 0.995
> 2176, 0, 34, 1, 0.982
> 2176, 2, 2, 0, 0.928
> 2176, 2, 2, 1, 0.928
> 2176, 34, 34, 0, 1.004
> 2176, 34, 34, 1, 1.004
> 2176, 2048, 0, 0, 0.985
> 2176, 2048, 0, 1, 0.986
> 2176, 2050, 0, 0, 1.023
> 2176, 2050, 0, 1, 1.023
> 2176, 2048, 2, 0, 0.802
> 2176, 2048, 2, 1, 0.802
> 2176, 2050, 2, 0, 0.894
> 2176, 2050, 2, 1, 0.894
> 2176, 2, 1, 0, 1.068
> 2176, 2, 1, 1, 1.068
> 2176, 1, 2, 0, 0.976
> 2176, 1, 2, 1, 0.976
> 2176, 34, 1, 0, 1.077
> 2176, 34, 1, 1, 1.077
> 2176, 1, 34, 0, 0.978
> 2176, 1, 34, 1, 0.978
> 2176, 2050, 1, 0, 1.061
> 2176, 2050, 1, 1, 1.061
> 2176, 2049, 2, 0, 0.971
> 2176, 2049, 2, 1, 0.971
> 2240, 0, 0, 0, 0.994
> 2240, 0, 0, 1, 0.994
> 2240, 3, 0, 0, 1.038
> 2240, 3, 0, 1, 1.039
> 2240, 35, 0, 0, 1.019
> 2240, 35, 0, 1, 1.019
> 2240, 0, 3, 0, 0.979
> 2240, 0, 3, 1, 0.98
> 2240, 0, 35, 0, 0.991
> 2240, 0, 35, 1, 0.991
> 2240, 3, 3, 0, 0.931
> 2240, 3, 3, 1, 0.931
> 2240, 35, 35, 0, 0.999
> 2240, 35, 35, 1, 0.999
> 2240, 2048, 0, 0, 0.995
> 2240, 2048, 0, 1, 0.995
> 2240, 2051, 0, 0, 1.039
> 2240, 2051, 0, 1, 1.039
> 2240, 2048, 3, 0, 0.799
> 2240, 2048, 3, 1, 0.799
> 2240, 2051, 3, 0, 0.889
> 2240, 2051, 3, 1, 0.889
> 2240, 3, 1, 0, 1.06
> 2240, 3, 1, 1, 1.06
> 2240, 1, 3, 0, 0.968
> 2240, 1, 3, 1, 0.968
> 2240, 35, 1, 0, 1.071
> 2240, 35, 1, 1, 1.071
> 2240, 1, 35, 0, 0.971
> 2240, 1, 35, 1, 0.971
> 2240, 2051, 1, 0, 1.057
> 2240, 2051, 1, 1, 1.057
> 2240, 2049, 3, 0, 0.966
> 2240, 2049, 3, 1, 0.966
> 2304, 0, 0, 0, 0.986
> 2304, 0, 0, 1, 0.986
> 2304, 4, 0, 0, 1.031
> 2304, 4, 0, 1, 1.032
> 2304, 36, 0, 0, 1.011
> 2304, 36, 0, 1, 1.011
> 2304, 0, 4, 0, 0.968
> 2304, 0, 4, 1, 0.969
> 2304, 0, 36, 0, 0.988
> 2304, 0, 36, 1, 0.988
> 2304, 4, 4, 0, 0.93
> 2304, 4, 4, 1, 0.931
> 2304, 36, 36, 0, 0.992
> 2304, 36, 36, 1, 0.992
> 2304, 2048, 0, 0, 0.988
> 2304, 2048, 0, 1, 0.988
> 2304, 2052, 0, 0, 1.032
> 2304, 2052, 0, 1, 1.032
> 2304, 2048, 4, 0, 0.793
> 2304, 2048, 4, 1, 0.793
> 2304, 2052, 4, 0, 0.884
> 2304, 2052, 4, 1, 0.884
> 2304, 4, 1, 0, 0.989
> 2304, 4, 1, 1, 0.989
> 2304, 1, 4, 0, 0.897
> 2304, 1, 4, 1, 0.898
> 2304, 36, 1, 0, 1.057
> 2304, 36, 1, 1, 1.057
> 2304, 1, 36, 0, 0.966
> 2304, 1, 36, 1, 0.966
> 2304, 2052, 1, 0, 1.052
> 2304, 2052, 1, 1, 1.052
> 2304, 2049, 4, 0, 0.955
> 2304, 2049, 4, 1, 0.955
> 2368, 0, 0, 0, 1.0
> 2368, 0, 0, 1, 1.001
> 2368, 5, 0, 0, 1.024
> 2368, 5, 0, 1, 1.025
> 2368, 37, 0, 0, 1.0
> 2368, 37, 0, 1, 1.0
> 2368, 0, 5, 0, 0.98
> 2368, 0, 5, 1, 0.981
> 2368, 0, 37, 0, 0.983
> 2368, 0, 37, 1, 0.98
> 2368, 5, 5, 0, 0.944
> 2368, 5, 5, 1, 0.944
> 2368, 37, 37, 0, 1.003
> 2368, 37, 37, 1, 1.003
> 2368, 2048, 0, 0, 1.002
> 2368, 2048, 0, 1, 1.002
> 2368, 2053, 0, 0, 1.025
> 2368, 2053, 0, 1, 1.025
> 2368, 2048, 5, 0, 0.801
> 2368, 2048, 5, 1, 0.801
> 2368, 2053, 5, 0, 0.907
> 2368, 2053, 5, 1, 0.907
> 2368, 5, 1, 0, 1.071
> 2368, 5, 1, 1, 1.071
> 2368, 1, 5, 0, 0.973
> 2368, 1, 5, 1, 0.973
> 2368, 37, 1, 0, 1.07
> 2368, 37, 1, 1, 1.07
> 2368, 1, 37, 0, 0.974
> 2368, 1, 37, 1, 0.974
> 2368, 2053, 1, 0, 1.065
> 2368, 2053, 1, 1, 1.065
> 2368, 2049, 5, 0, 0.967
> 2368, 2049, 5, 1, 0.967
> 2432, 0, 0, 0, 0.965
> 2432, 0, 0, 1, 1.0
> 2432, 6, 0, 0, 1.038
> 2432, 6, 0, 1, 1.039
> 2432, 38, 0, 0, 1.021
> 2432, 38, 0, 1, 1.021
> 2432, 0, 6, 0, 0.974
> 2432, 0, 6, 1, 0.976
> 2432, 0, 38, 0, 0.986
> 2432, 0, 38, 1, 0.986
> 2432, 6, 6, 0, 0.926
> 2432, 6, 6, 1, 0.926
> 2432, 38, 38, 0, 1.0
> 2432, 38, 38, 1, 1.0
> 2432, 2048, 0, 0, 1.004
> 2432, 2048, 0, 1, 1.004
> 2432, 2054, 0, 0, 1.039
> 2432, 2054, 0, 1, 1.039
> 2432, 2048, 6, 0, 0.797
> 2432, 2048, 6, 1, 0.797
> 2432, 2054, 6, 0, 0.898
> 2432, 2054, 6, 1, 0.898
> 2432, 6, 1, 0, 1.063
> 2432, 6, 1, 1, 1.063
> 2432, 1, 6, 0, 0.965
> 2432, 1, 6, 1, 0.965
> 2432, 38, 1, 0, 1.068
> 2432, 38, 1, 1, 1.068
> 2432, 1, 38, 0, 0.968
> 2432, 1, 38, 1, 0.968
> 2432, 2054, 1, 0, 1.06
> 2432, 2054, 1, 1, 1.06
> 2432, 2049, 6, 0, 0.963
> 2432, 2049, 6, 1, 0.963
> 2496, 0, 0, 0, 1.013
> 2496, 0, 0, 1, 1.013
> 2496, 7, 0, 0, 1.032
> 2496, 7, 0, 1, 1.032
> 2496, 39, 0, 0, 1.013
> 2496, 39, 0, 1, 1.013
> 2496, 0, 7, 0, 0.965
> 2496, 0, 7, 1, 0.965
> 2496, 0, 39, 0, 0.979
> 2496, 0, 39, 1, 0.979
> 2496, 7, 7, 0, 0.925
> 2496, 7, 7, 1, 0.925
> 2496, 39, 39, 0, 0.989
> 2496, 39, 39, 1, 0.989
> 2496, 2048, 0, 0, 1.013
> 2496, 2048, 0, 1, 1.013
> 2496, 2055, 0, 0, 1.032
> 2496, 2055, 0, 1, 1.032
> 2496, 2048, 7, 0, 0.792
> 2496, 2048, 7, 1, 0.792
> 2496, 2055, 7, 0, 0.93
> 2496, 2055, 7, 1, 0.93
> 2496, 7, 1, 0, 0.984
> 2496, 7, 1, 1, 0.984
> 2496, 1, 7, 0, 0.894
> 2496, 1, 7, 1, 0.895
> 2496, 39, 1, 0, 1.054
> 2496, 39, 1, 1, 1.054
> 2496, 1, 39, 0, 0.963
> 2496, 1, 39, 1, 0.963
> 2496, 2055, 1, 0, 1.049
> 2496, 2055, 1, 1, 1.049
> 2496, 2049, 7, 0, 0.953
> 2496, 2049, 7, 1, 0.953
> 2560, 0, 0, 0, 0.991
> 2560, 0, 0, 1, 0.991
> 2560, 8, 0, 0, 1.031
> 2560, 8, 0, 1, 1.032
> 2560, 40, 0, 0, 1.029
> 2560, 40, 0, 1, 1.029
> 2560, 0, 8, 0, 0.992
> 2560, 0, 8, 1, 0.992
> 2560, 0, 40, 0, 0.975
> 2560, 0, 40, 1, 0.984
> 2560, 8, 8, 0, 0.942
> 2560, 8, 8, 1, 0.943
> 2560, 40, 40, 0, 1.139
> 2560, 40, 40, 1, 1.139
> 2560, 2048, 0, 0, 0.993
> 2560, 2048, 0, 1, 0.993
> 2560, 2056, 0, 0, 1.032
> 2560, 2056, 0, 1, 1.032
> 2560, 2048, 8, 0, 0.812
> 2560, 2048, 8, 1, 0.812
> 2560, 2056, 8, 0, 0.912
> 2560, 2056, 8, 1, 0.912
> 2560, 8, 1, 0, 1.068
> 2560, 8, 1, 1, 1.069
> 2560, 1, 8, 0, 0.974
> 2560, 1, 8, 1, 0.974
> 2560, 40, 1, 0, 1.068
> 2560, 40, 1, 1, 1.068
> 2560, 1, 40, 0, 0.996
> 2560, 1, 40, 1, 0.996
> 2560, 2056, 1, 0, 1.063
> 2560, 2056, 1, 1, 1.063
> 2560, 2049, 8, 0, 0.969
> 2560, 2049, 8, 1, 0.969
> 2624, 0, 0, 0, 0.995
> 2624, 0, 0, 1, 0.994
> 2624, 9, 0, 0, 1.015
> 2624, 9, 0, 1, 1.018
> 2624, 41, 0, 0, 1.044
> 2624, 41, 0, 1, 1.044
> 2624, 0, 9, 0, 0.988
> 2624, 0, 9, 1, 0.99
> 2624, 0, 41, 0, 0.989
> 2624, 0, 41, 1, 0.99
> 2624, 9, 9, 0, 0.943
> 2624, 9, 9, 1, 0.943
> 2624, 41, 41, 0, 0.993
> 2624, 41, 41, 1, 0.993
> 2624, 2048, 0, 0, 0.998
> 2624, 2048, 0, 1, 0.998
> 2624, 2057, 0, 0, 1.018
> 2624, 2057, 0, 1, 1.018
> 2624, 2048, 9, 0, 0.81
> 2624, 2048, 9, 1, 0.81
> 2624, 2057, 9, 0, 0.907
> 2624, 2057, 9, 1, 0.907
> 2624, 9, 1, 0, 1.09
> 2624, 9, 1, 1, 1.09
> 2624, 1, 9, 0, 0.967
> 2624, 1, 9, 1, 0.967
> 2624, 41, 1, 0, 1.084
> 2624, 41, 1, 1, 1.085
> 2624, 1, 41, 0, 0.958
> 2624, 1, 41, 1, 0.957
> 2624, 2057, 1, 0, 1.087
> 2624, 2057, 1, 1, 1.087
> 2624, 2049, 9, 0, 0.965
> 2624, 2049, 9, 1, 0.965
> 2688, 0, 0, 0, 0.995
> 2688, 0, 0, 1, 0.995
> 2688, 10, 0, 0, 1.01
> 2688, 10, 0, 1, 1.012
> 2688, 42, 0, 0, 1.036
> 2688, 42, 0, 1, 1.036
> 2688, 0, 10, 0, 0.978
> 2688, 0, 10, 1, 0.979
> 2688, 0, 42, 0, 0.977
> 2688, 0, 42, 1, 0.978
> 2688, 10, 10, 0, 0.942
> 2688, 10, 10, 1, 0.942
> 2688, 42, 42, 0, 0.989
> 2688, 42, 42, 1, 0.989
> 2688, 2048, 0, 0, 0.995
> 2688, 2048, 0, 1, 0.995
> 2688, 2058, 0, 0, 1.012
> 2688, 2058, 0, 1, 1.012
> 2688, 2048, 10, 0, 0.804
> 2688, 2048, 10, 1, 0.804
> 2688, 2058, 10, 0, 0.905
> 2688, 2058, 10, 1, 0.905
> 2688, 10, 1, 0, 0.986
> 2688, 10, 1, 1, 0.987
> 2688, 1, 10, 0, 0.893
> 2688, 1, 10, 1, 0.894
> 2688, 42, 1, 0, 1.054
> 2688, 42, 1, 1, 1.054
> 2688, 1, 42, 0, 0.958
> 2688, 1, 42, 1, 0.958
> 2688, 2058, 1, 0, 1.052
> 2688, 2058, 1, 1, 1.052
> 2688, 2049, 10, 0, 0.954
> 2688, 2049, 10, 1, 0.954
> 2752, 0, 0, 0, 1.0
> 2752, 0, 0, 1, 0.992
> 2752, 11, 0, 0, 0.954
> 2752, 11, 0, 1, 0.954
> 2752, 43, 0, 0, 0.979
> 2752, 43, 0, 1, 0.979
> 2752, 0, 11, 0, 0.939
> 2752, 0, 11, 1, 0.939
> 2752, 0, 43, 0, 0.931
> 2752, 0, 43, 1, 0.932
> 2752, 11, 11, 0, 0.949
> 2752, 11, 11, 1, 0.949
> 2752, 43, 43, 0, 1.007
> 2752, 43, 43, 1, 1.007
> 2752, 2048, 0, 0, 0.993
> 2752, 2048, 0, 1, 0.993
> 2752, 2059, 0, 0, 0.954
> 2752, 2059, 0, 1, 0.954
> 2752, 2048, 11, 0, 0.77
> 2752, 2048, 11, 1, 0.77
> 2752, 2059, 11, 0, 0.916
> 2752, 2059, 11, 1, 0.916
> 2752, 11, 1, 0, 0.994
> 2752, 11, 1, 1, 0.994
> 2752, 1, 11, 0, 0.928
> 2752, 1, 11, 1, 0.928
> 2752, 43, 1, 0, 1.022
> 2752, 43, 1, 1, 1.022
> 2752, 1, 43, 0, 0.92
> 2752, 1, 43, 1, 0.92
> 2752, 2059, 1, 0, 0.989
> 2752, 2059, 1, 1, 0.989
> 2752, 2049, 11, 0, 0.923
> 2752, 2049, 11, 1, 0.923
> 2816, 0, 0, 0, 1.003
> 2816, 0, 0, 1, 1.003
> 2816, 12, 0, 0, 0.897
> 2816, 12, 0, 1, 0.894
> 2816, 44, 0, 0, 0.914
> 2816, 44, 0, 1, 0.914
> 2816, 0, 12, 0, 0.876
> 2816, 0, 12, 1, 0.874
> 2816, 0, 44, 0, 0.871
> 2816, 0, 44, 1, 0.87
> 2816, 12, 12, 0, 0.948
> 2816, 12, 12, 1, 0.948
> 2816, 44, 44, 0, 1.009
> 2816, 44, 44, 1, 1.009
> 2816, 2048, 0, 0, 1.005
> 2816, 2048, 0, 1, 1.005
> 2816, 2060, 0, 0, 0.894
> 2816, 2060, 0, 1, 0.894
> 2816, 2048, 12, 0, 0.714
> 2816, 2048, 12, 1, 0.713
> 2816, 2060, 12, 0, 0.915
> 2816, 2060, 12, 1, 0.915
> 2816, 12, 1, 0, 0.917
> 2816, 12, 1, 1, 0.917
> 2816, 1, 12, 0, 0.858
> 2816, 1, 12, 1, 0.857
> 2816, 44, 1, 0, 0.944
> 2816, 44, 1, 1, 0.943
> 2816, 1, 44, 0, 0.856
> 2816, 1, 44, 1, 0.856
> 2816, 2060, 1, 0, 0.914
> 2816, 2060, 1, 1, 0.914
> 2816, 2049, 12, 0, 0.855
> 2816, 2049, 12, 1, 0.855
> 2880, 0, 0, 0, 0.989
> 2880, 0, 0, 1, 0.989
> 2880, 13, 0, 0, 0.967
> 2880, 13, 0, 1, 0.967
> 2880, 45, 0, 0, 0.987
> 2880, 45, 0, 1, 0.987
> 2880, 0, 13, 0, 0.925
> 2880, 0, 13, 1, 0.925
> 2880, 0, 45, 0, 0.927
> 2880, 0, 45, 1, 0.927
> 2880, 13, 13, 0, 0.944
> 2880, 13, 13, 1, 0.944
> 2880, 45, 45, 0, 1.003
> 2880, 45, 45, 1, 1.003
> 2880, 2048, 0, 0, 0.989
> 2880, 2048, 0, 1, 0.989
> 2880, 2061, 0, 0, 0.967
> 2880, 2061, 0, 1, 0.967
> 2880, 2048, 13, 0, 0.76
> 2880, 2048, 13, 1, 0.76
> 2880, 2061, 13, 0, 0.91
> 2880, 2061, 13, 1, 0.91
> 2880, 13, 1, 0, 0.922
> 2880, 13, 1, 1, 0.922
> 2880, 1, 13, 0, 0.859
> 2880, 1, 13, 1, 0.859
> 2880, 45, 1, 0, 1.013
> 2880, 45, 1, 1, 1.013
> 2880, 1, 45, 0, 0.92
> 2880, 1, 45, 1, 0.92
> 2880, 2061, 1, 0, 0.984
> 2880, 2061, 1, 1, 0.984
> 2880, 2049, 13, 0, 0.918
> 2880, 2049, 13, 1, 0.918
> 2944, 0, 0, 0, 1.014
> 2944, 0, 0, 1, 1.014
> 2944, 14, 0, 0, 0.956
> 2944, 14, 0, 1, 0.955
> 2944, 46, 0, 0, 0.979
> 2944, 46, 0, 1, 0.979
> 2944, 0, 14, 0, 0.937
> 2944, 0, 14, 1, 0.937
> 2944, 0, 46, 0, 0.93
> 2944, 0, 46, 1, 0.93
> 2944, 14, 14, 0, 0.953
> 2944, 14, 14, 1, 0.953
> 2944, 46, 46, 0, 1.009
> 2944, 46, 46, 1, 1.009
> 2944, 2048, 0, 0, 1.015
> 2944, 2048, 0, 1, 1.015
> 2944, 2062, 0, 0, 0.955
> 2944, 2062, 0, 1, 0.955
> 2944, 2048, 14, 0, 0.769
> 2944, 2048, 14, 1, 0.769
> 2944, 2062, 14, 0, 0.923
> 2944, 2062, 14, 1, 0.923
> 2944, 14, 1, 0, 0.994
> 2944, 14, 1, 1, 0.994
> 2944, 1, 14, 0, 0.927
> 2944, 1, 14, 1, 0.927
> 2944, 46, 1, 0, 1.021
> 2944, 46, 1, 1, 1.021
> 2944, 1, 46, 0, 0.923
> 2944, 1, 46, 1, 0.923
> 2944, 2062, 1, 0, 0.988
> 2944, 2062, 1, 1, 0.988
> 2944, 2049, 14, 0, 0.922
> 2944, 2049, 14, 1, 0.922
> 3008, 0, 0, 0, 0.994
> 3008, 0, 0, 1, 0.994
> 3008, 15, 0, 0, 0.941
> 3008, 15, 0, 1, 0.941
> 3008, 47, 0, 0, 0.996
> 3008, 47, 0, 1, 0.996
> 3008, 0, 15, 0, 0.929
> 3008, 0, 15, 1, 0.933
> 3008, 0, 47, 0, 0.933
> 3008, 0, 47, 1, 0.933
> 3008, 15, 15, 0, 0.952
> 3008, 15, 15, 1, 0.949
> 3008, 47, 47, 0, 1.003
> 3008, 47, 47, 1, 1.003
> 3008, 2048, 0, 0, 0.998
> 3008, 2048, 0, 1, 0.998
> 3008, 2063, 0, 0, 0.941
> 3008, 2063, 0, 1, 0.941
> 3008, 2048, 15, 0, 0.766
> 3008, 2048, 15, 1, 0.766
> 3008, 2063, 15, 0, 0.916
> 3008, 2063, 15, 1, 0.916
> 3008, 15, 1, 0, 0.985
> 3008, 15, 1, 1, 0.985
> 3008, 1, 15, 0, 0.916
> 3008, 1, 15, 1, 0.916
> 3008, 47, 1, 0, 1.014
> 3008, 47, 1, 1, 1.014
> 3008, 1, 47, 0, 0.902
> 3008, 1, 47, 1, 0.902
> 3008, 2063, 1, 0, 0.981
> 3008, 2063, 1, 1, 0.981
> 3008, 2049, 15, 0, 0.912
> 3008, 2049, 15, 1, 0.913
> 3072, 0, 0, 0, 1.016
> 3072, 0, 0, 1, 1.015
> 3072, 16, 0, 0, 1.045
> 3072, 16, 0, 1, 1.045
> 3072, 48, 0, 0, 1.045
> 3072, 48, 0, 1, 1.045
> 3072, 0, 16, 0, 1.049
> 3072, 0, 16, 1, 1.049
> 3072, 0, 48, 0, 1.049
> 3072, 0, 48, 1, 1.049
> 3072, 16, 16, 0, 1.016
> 3072, 16, 16, 1, 1.016
> 3072, 48, 48, 0, 1.016
> 3072, 48, 48, 1, 1.016
> 3072, 2048, 0, 0, 1.016
> 3072, 2048, 0, 1, 1.016
> 3072, 2064, 0, 0, 1.045
> 3072, 2064, 0, 1, 1.045
> 3072, 2048, 16, 0, 1.049
> 3072, 2048, 16, 1, 1.049
> 3072, 2064, 16, 0, 1.016
> 3072, 2064, 16, 1, 1.016
> 3072, 16, 1, 0, 0.815
> 3072, 16, 1, 1, 0.815
> 3072, 1, 16, 0, 0.872
> 3072, 1, 16, 1, 0.872
> 3072, 48, 1, 0, 1.017
> 3072, 48, 1, 1, 1.017
> 3072, 1, 48, 0, 0.872
> 3072, 1, 48, 1, 0.872
> 3072, 2064, 1, 0, 0.815
> 3072, 2064, 1, 1, 0.815
> 3072, 2049, 16, 0, 0.872
> 3072, 2049, 16, 1, 0.872
> 3136, 0, 0, 0, 0.995
> 3136, 0, 0, 1, 0.995
> 3136, 17, 0, 0, 0.949
> 3136, 17, 0, 1, 0.949
> 3136, 49, 0, 0, 0.987
> 3136, 49, 0, 1, 0.987
> 3136, 0, 17, 0, 0.919
> 3136, 0, 17, 1, 0.917
> 3136, 0, 49, 0, 0.931
> 3136, 0, 49, 1, 0.931
> 3136, 17, 17, 0, 1.122
> 3136, 17, 17, 1, 1.119
> 3136, 49, 49, 0, 0.987
> 3136, 49, 49, 1, 0.987
> 3136, 2048, 0, 0, 0.997
> 3136, 2048, 0, 1, 0.997
> 3136, 2065, 0, 0, 0.949
> 3136, 2065, 0, 1, 0.949
> 3136, 2048, 17, 0, 0.896
> 3136, 2048, 17, 1, 0.896
> 3136, 2065, 17, 0, 1.122
> 3136, 2065, 17, 1, 1.119
> 3136, 17, 1, 0, 1.184
> 3136, 17, 1, 1, 1.184
> 3136, 1, 17, 0, 1.124
> 3136, 1, 17, 1, 1.125
> 3136, 49, 1, 0, 1.11
> 3136, 49, 1, 1, 1.108
> 3136, 1, 49, 0, 1.044
> 3136, 1, 49, 1, 1.044
> 3136, 2065, 1, 0, 1.147
> 3136, 2065, 1, 1, 1.147
> 3136, 2049, 17, 0, 1.102
> 3136, 2049, 17, 1, 1.1
> 3200, 0, 0, 0, 1.006
> 3200, 0, 0, 1, 1.006
> 3200, 18, 0, 0, 0.978
> 3200, 18, 0, 1, 0.978
> 3200, 50, 0, 0, 0.998
> 3200, 50, 0, 1, 0.998
> 3200, 0, 18, 0, 0.932
> 3200, 0, 18, 1, 0.932
> 3200, 0, 50, 0, 0.93
> 3200, 0, 50, 1, 0.93
> 3200, 18, 18, 0, 1.11
> 3200, 18, 18, 1, 1.11
> 3200, 50, 50, 0, 0.994
> 3200, 50, 50, 1, 0.994
> 3200, 2048, 0, 0, 1.007
> 3200, 2048, 0, 1, 1.007
> 3200, 2066, 0, 0, 0.978
> 3200, 2066, 0, 1, 0.978
> 3200, 2048, 18, 0, 0.894
> 3200, 2048, 18, 1, 0.894
> 3200, 2066, 18, 0, 1.11
> 3200, 2066, 18, 1, 1.11
> 3200, 18, 1, 0, 1.002
> 3200, 18, 1, 1, 1.002
> 3200, 1, 18, 0, 0.917
> 3200, 1, 18, 1, 0.917
> 3200, 50, 1, 0, 0.963
> 3200, 50, 1, 1, 0.964
> 3200, 1, 50, 0, 0.888
> 3200, 1, 50, 1, 0.888
> 3200, 2066, 1, 0, 1.002
> 3200, 2066, 1, 1, 1.002
> 3200, 2049, 18, 0, 0.914
> 3200, 2049, 18, 1, 0.914
> 3264, 0, 0, 0, 0.994
> 3264, 0, 0, 1, 0.994
> 3264, 19, 0, 0, 0.959
> 3264, 19, 0, 1, 0.959
> 3264, 51, 0, 0, 0.994
> 3264, 51, 0, 1, 0.994
> 3264, 0, 19, 0, 0.927
> 3264, 0, 19, 1, 0.927
> 3264, 0, 51, 0, 0.927
> 3264, 0, 51, 1, 0.927
> 3264, 19, 19, 0, 1.1
> 3264, 19, 19, 1, 1.1
> 3264, 51, 51, 0, 0.982
> 3264, 51, 51, 1, 0.982
> 3264, 2048, 0, 0, 0.994
> 3264, 2048, 0, 1, 0.994
> 3264, 2067, 0, 0, 0.959
> 3264, 2067, 0, 1, 0.959
> 3264, 2048, 19, 0, 0.891
> 3264, 2048, 19, 1, 0.891
> 3264, 2067, 19, 0, 1.099
> 3264, 2067, 19, 1, 1.099
> 3264, 19, 1, 0, 0.977
> 3264, 19, 1, 1, 0.976
> 3264, 1, 19, 0, 0.921
> 3264, 1, 19, 1, 0.921
> 3264, 51, 1, 0, 0.959
> 3264, 51, 1, 1, 0.959
> 3264, 1, 51, 0, 0.886
> 3264, 1, 51, 1, 0.886
> 3264, 2067, 1, 0, 0.976
> 3264, 2067, 1, 1, 0.976
> 3264, 2049, 19, 0, 0.917
> 3264, 2049, 19, 1, 0.917
> 3328, 0, 0, 0, 0.996
> 3328, 0, 0, 1, 0.992
> 3328, 20, 0, 0, 0.955
> 3328, 20, 0, 1, 0.955
> 3328, 52, 0, 0, 0.99
> 3328, 52, 0, 1, 0.99
> 3328, 0, 20, 0, 0.926
> 3328, 0, 20, 1, 0.923
> 3328, 0, 52, 0, 0.933
> 3328, 0, 52, 1, 0.933
> 3328, 20, 20, 0, 1.11
> 3328, 20, 20, 1, 1.11
> 3328, 52, 52, 0, 0.988
> 3328, 52, 52, 1, 0.988
> 3328, 2048, 0, 0, 0.993
> 3328, 2048, 0, 1, 0.993
> 3328, 2068, 0, 0, 0.955
> 3328, 2068, 0, 1, 0.955
> 3328, 2048, 20, 0, 0.9
> 3328, 2048, 20, 1, 0.9
> 3328, 2068, 20, 0, 1.109
> 3328, 2068, 20, 1, 1.109
> 3328, 20, 1, 0, 0.99
> 3328, 20, 1, 1, 0.99
> 3328, 1, 20, 0, 0.922
> 3328, 1, 20, 1, 0.922
> 3328, 52, 1, 0, 0.972
> 3328, 52, 1, 1, 0.972
> 3328, 1, 52, 0, 0.901
> 3328, 1, 52, 1, 0.901
> 3328, 2068, 1, 0, 0.99
> 3328, 2068, 1, 1, 0.99
> 3328, 2049, 20, 0, 0.918
> 3328, 2049, 20, 1, 0.918
> 3392, 0, 0, 0, 0.998
> 3392, 0, 0, 1, 1.0
> 3392, 21, 0, 0, 0.964
> 3392, 21, 0, 1, 0.964
> 3392, 53, 0, 0, 0.998
> 3392, 53, 0, 1, 0.998
> 3392, 0, 21, 0, 0.932
> 3392, 0, 21, 1, 0.932
> 3392, 0, 53, 0, 0.93
> 3392, 0, 53, 1, 0.93
> 3392, 21, 21, 0, 1.113
> 3392, 21, 21, 1, 1.113
> 3392, 53, 53, 0, 0.983
> 3392, 53, 53, 1, 0.983
> 3392, 2048, 0, 0, 1.0
> 3392, 2048, 0, 1, 1.0
> 3392, 2069, 0, 0, 0.964
> 3392, 2069, 0, 1, 0.964
> 3392, 2048, 21, 0, 0.895
> 3392, 2048, 21, 1, 0.896
> 3392, 2069, 21, 0, 1.113
> 3392, 2069, 21, 1, 1.113
> 3392, 21, 1, 0, 0.994
> 3392, 21, 1, 1, 0.994
> 3392, 1, 21, 0, 0.923
> 3392, 1, 21, 1, 0.923
> 3392, 53, 1, 0, 0.972
> 3392, 53, 1, 1, 0.972
> 3392, 1, 53, 0, 0.891
> 3392, 1, 53, 1, 0.891
> 3392, 2069, 1, 0, 0.994
> 3392, 2069, 1, 1, 0.994
> 3392, 2049, 21, 0, 0.922
> 3392, 2049, 21, 1, 0.922
> 3456, 0, 0, 0, 0.995
> 3456, 0, 0, 1, 0.995
> 3456, 22, 0, 0, 0.965
> 3456, 22, 0, 1, 0.965
> 3456, 54, 0, 0, 0.996
> 3456, 54, 0, 1, 0.996
> 3456, 0, 22, 0, 0.927
> 3456, 0, 22, 1, 0.927
> 3456, 0, 54, 0, 0.927
> 3456, 0, 54, 1, 0.927
> 3456, 22, 22, 0, 1.107
> 3456, 22, 22, 1, 1.107
> 3456, 54, 54, 0, 0.98
> 3456, 54, 54, 1, 0.98
> 3456, 2048, 0, 0, 0.995
> 3456, 2048, 0, 1, 0.995
> 3456, 2070, 0, 0, 0.965
> 3456, 2070, 0, 1, 0.965
> 3456, 2048, 22, 0, 0.893
> 3456, 2048, 22, 1, 0.893
> 3456, 2070, 22, 0, 1.107
> 3456, 2070, 22, 1, 1.107
> 3456, 22, 1, 0, 0.988
> 3456, 22, 1, 1, 0.988
> 3456, 1, 22, 0, 0.921
> 3456, 1, 22, 1, 0.921
> 3456, 54, 1, 0, 0.963
> 3456, 54, 1, 1, 0.963
> 3456, 1, 54, 0, 0.887
> 3456, 1, 54, 1, 0.887
> 3456, 2070, 1, 0, 0.988
> 3456, 2070, 1, 1, 0.988
> 3456, 2049, 22, 0, 0.917
> 3456, 2049, 22, 1, 0.917
> 3520, 0, 0, 0, 1.016
> 3520, 0, 0, 1, 1.016
> 3520, 23, 0, 0, 0.957
> 3520, 23, 0, 1, 0.957
> 3520, 55, 0, 0, 0.991
> 3520, 55, 0, 1, 0.991
> 3520, 0, 23, 0, 0.919
> 3520, 0, 23, 1, 0.924
> 3520, 0, 55, 0, 0.934
> 3520, 0, 55, 1, 0.934
> 3520, 23, 23, 0, 1.111
> 3520, 23, 23, 1, 1.111
> 3520, 55, 55, 0, 0.994
> 3520, 55, 55, 1, 0.994
> 3520, 2048, 0, 0, 1.016
> 3520, 2048, 0, 1, 1.016
> 3520, 2071, 0, 0, 0.957
> 3520, 2071, 0, 1, 0.957
> 3520, 2048, 23, 0, 0.903
> 3520, 2048, 23, 1, 0.903
> 3520, 2071, 23, 0, 1.111
> 3520, 2071, 23, 1, 1.111
> 3520, 23, 1, 0, 0.997
> 3520, 23, 1, 1, 0.997
> 3520, 1, 23, 0, 0.921
> 3520, 1, 23, 1, 0.921
> 3520, 55, 1, 0, 0.976
> 3520, 55, 1, 1, 0.976
> 3520, 1, 55, 0, 0.902
> 3520, 1, 55, 1, 0.902
> 3520, 2071, 1, 0, 0.997
> 3520, 2071, 1, 1, 0.997
> 3520, 2049, 23, 0, 0.918
> 3520, 2049, 23, 1, 0.918
> 3584, 0, 0, 0, 1.004
> 3584, 0, 0, 1, 1.004
> 3584, 24, 0, 0, 0.985
> 3584, 24, 0, 1, 0.979
> 3584, 56, 0, 0, 1.006
> 3584, 56, 0, 1, 1.006
> 3584, 0, 24, 0, 0.931
> 3584, 0, 24, 1, 0.931
> 3584, 0, 56, 0, 0.93
> 3584, 0, 56, 1, 0.93
> 3584, 24, 24, 0, 1.111
> 3584, 24, 24, 1, 1.11
> 3584, 56, 56, 0, 1.101
> 3584, 56, 56, 1, 1.1
> 3584, 2048, 0, 0, 1.005
> 3584, 2048, 0, 1, 1.005
> 3584, 2072, 0, 0, 0.98
> 3584, 2072, 0, 1, 0.978
> 3584, 2048, 24, 0, 0.896
> 3584, 2048, 24, 1, 0.897
> 3584, 2072, 24, 0, 1.111
> 3584, 2072, 24, 1, 1.111
> 3584, 24, 1, 0, 1.004
> 3584, 24, 1, 1, 1.004
> 3584, 1, 24, 0, 0.921
> 3584, 1, 24, 1, 0.921
> 3584, 56, 1, 0, 0.971
> 3584, 56, 1, 1, 0.97
> 3584, 1, 56, 0, 0.89
> 3584, 1, 56, 1, 0.89
> 3584, 2072, 1, 0, 1.004
> 3584, 2072, 1, 1, 1.004
> 3584, 2049, 24, 0, 0.918
> 3584, 2049, 24, 1, 0.918
> 3648, 0, 0, 0, 1.012
> 3648, 0, 0, 1, 1.012
> 3648, 25, 0, 0, 0.96
> 3648, 25, 0, 1, 0.96
> 3648, 57, 0, 0, 0.988
> 3648, 57, 0, 1, 0.988
> 3648, 0, 25, 0, 0.927
> 3648, 0, 25, 1, 0.927
> 3648, 0, 57, 0, 0.927
> 3648, 0, 57, 1, 0.927
> 3648, 25, 25, 0, 1.101
> 3648, 25, 25, 1, 1.101
> 3648, 57, 57, 0, 0.986
> 3648, 57, 57, 1, 0.986
> 3648, 2048, 0, 0, 1.012
> 3648, 2048, 0, 1, 1.012
> 3648, 2073, 0, 0, 0.96
> 3648, 2073, 0, 1, 0.959
> 3648, 2048, 25, 0, 0.894
> 3648, 2048, 25, 1, 0.895
> 3648, 2073, 25, 0, 1.103
> 3648, 2073, 25, 1, 1.103
> 3648, 25, 1, 0, 1.024
> 3648, 25, 1, 1, 1.024
> 3648, 1, 25, 0, 0.911
> 3648, 1, 25, 1, 0.912
> 3648, 57, 1, 0, 0.973
> 3648, 57, 1, 1, 0.974
> 3648, 1, 57, 0, 0.888
> 3648, 1, 57, 1, 0.888
> 3648, 2073, 1, 0, 1.024
> 3648, 2073, 1, 1, 1.024
> 3648, 2049, 25, 0, 0.907
> 3648, 2049, 25, 1, 0.907
> 3712, 0, 0, 0, 0.996
> 3712, 0, 0, 1, 0.996
> 3712, 26, 0, 0, 0.96
> 3712, 26, 0, 1, 0.96
> 3712, 58, 0, 0, 0.995
> 3712, 58, 0, 1, 0.995
> 3712, 0, 26, 0, 0.919
> 3712, 0, 26, 1, 0.918
> 3712, 0, 58, 0, 0.93
> 3712, 0, 58, 1, 0.93
> 3712, 26, 26, 0, 1.103
> 3712, 26, 26, 1, 1.102
> 3712, 58, 58, 0, 0.989
> 3712, 58, 58, 1, 0.989
> 3712, 2048, 0, 0, 0.997
> 3712, 2048, 0, 1, 0.997
> 3712, 2074, 0, 0, 0.959
> 3712, 2074, 0, 1, 0.959
> 3712, 2048, 26, 0, 0.901
> 3712, 2048, 26, 1, 0.901
> 3712, 2074, 26, 0, 1.104
> 3712, 2074, 26, 1, 1.102
> 3712, 26, 1, 0, 1.001
> 3712, 26, 1, 1, 1.001
> 3712, 1, 26, 0, 0.922
> 3712, 1, 26, 1, 0.922
> 3712, 58, 1, 0, 0.974
> 3712, 58, 1, 1, 0.974
> 3712, 1, 58, 0, 0.903
> 3712, 1, 58, 1, 0.903
> 3712, 2074, 1, 0, 1.001
> 3712, 2074, 1, 1, 1.001
> 3712, 2049, 26, 0, 0.919
> 3712, 2049, 26, 1, 0.919
> 3776, 0, 0, 0, 1.003
> 3776, 0, 0, 1, 1.003
> 3776, 27, 0, 0, 0.964
> 3776, 27, 0, 1, 0.964
> 3776, 59, 0, 0, 1.004
> 3776, 59, 0, 1, 1.004
> 3776, 0, 27, 0, 0.931
> 3776, 0, 27, 1, 0.931
> 3776, 0, 59, 0, 0.929
> 3776, 0, 59, 1, 0.93
> 3776, 27, 27, 0, 1.097
> 3776, 27, 27, 1, 1.097
> 3776, 59, 59, 0, 0.992
> 3776, 59, 59, 1, 0.992
> 3776, 2048, 0, 0, 1.003
> 3776, 2048, 0, 1, 1.003
> 3776, 2075, 0, 0, 0.963
> 3776, 2075, 0, 1, 0.964
> 3776, 2048, 27, 0, 0.898
> 3776, 2048, 27, 1, 0.898
> 3776, 2075, 27, 0, 1.097
> 3776, 2075, 27, 1, 1.097
> 3776, 27, 1, 0, 0.998
> 3776, 27, 1, 1, 0.998
> 3776, 1, 27, 0, 0.925
> 3776, 1, 27, 1, 0.925
> 3776, 59, 1, 0, 0.979
> 3776, 59, 1, 1, 0.979
> 3776, 1, 59, 0, 0.894
> 3776, 1, 59, 1, 0.894
> 3776, 2075, 1, 0, 0.998
> 3776, 2075, 1, 1, 0.999
> 3776, 2049, 27, 0, 0.923
> 3776, 2049, 27, 1, 0.923
> 3840, 0, 0, 0, 0.997
> 3840, 0, 0, 1, 0.997
> 3840, 28, 0, 0, 0.968
> 3840, 28, 0, 1, 0.968
> 3840, 60, 0, 0, 1.001
> 3840, 60, 0, 1, 1.001
> 3840, 0, 28, 0, 0.926
> 3840, 0, 28, 1, 0.927
> 3840, 0, 60, 0, 0.927
> 3840, 0, 60, 1, 0.927
> 3840, 28, 28, 0, 1.094
> 3840, 28, 28, 1, 1.094
> 3840, 60, 60, 0, 0.982
> 3840, 60, 60, 1, 0.982
> 3840, 2048, 0, 0, 0.998
> 3840, 2048, 0, 1, 0.998
> 3840, 2076, 0, 0, 0.968
> 3840, 2076, 0, 1, 0.968
> 3840, 2048, 28, 0, 0.896
> 3840, 2048, 28, 1, 0.896
> 3840, 2076, 28, 0, 1.094
> 3840, 2076, 28, 1, 1.094
> 3840, 28, 1, 0, 0.983
> 3840, 28, 1, 1, 0.982
> 3840, 1, 28, 0, 0.916
> 3840, 1, 28, 1, 0.916
> 3840, 60, 1, 0, 0.969
> 3840, 60, 1, 1, 0.969
> 3840, 1, 60, 0, 0.891
> 3840, 1, 60, 1, 0.891
> 3840, 2076, 1, 0, 0.983
> 3840, 2076, 1, 1, 0.983
> 3840, 2049, 28, 0, 0.912
> 3840, 2049, 28, 1, 0.912
> 3904, 0, 0, 0, 1.002
> 3904, 0, 0, 1, 1.0
> 3904, 29, 0, 0, 0.961
> 3904, 29, 0, 1, 0.961
> 3904, 61, 0, 0, 0.997
> 3904, 61, 0, 1, 0.997
> 3904, 0, 29, 0, 0.915
> 3904, 0, 29, 1, 0.922
> 3904, 0, 61, 0, 0.933
> 3904, 0, 61, 1, 0.933
> 3904, 29, 29, 0, 1.103
> 3904, 29, 29, 1, 1.103
> 3904, 61, 61, 0, 0.995
> 3904, 61, 61, 1, 0.995
> 3904, 2048, 0, 0, 0.998
> 3904, 2048, 0, 1, 1.0
> 3904, 2077, 0, 0, 0.961
> 3904, 2077, 0, 1, 0.961
> 3904, 2048, 29, 0, 0.904
> 3904, 2048, 29, 1, 0.904
> 3904, 2077, 29, 0, 1.103
> 3904, 2077, 29, 1, 1.103
> 3904, 29, 1, 0, 1.0
> 3904, 29, 1, 1, 1.0
> 3904, 1, 29, 0, 0.922
> 3904, 1, 29, 1, 0.922
> 3904, 61, 1, 0, 0.98
> 3904, 61, 1, 1, 0.98
> 3904, 1, 61, 0, 0.904
> 3904, 1, 61, 1, 0.904
> 3904, 2077, 1, 0, 1.0
> 3904, 2077, 1, 1, 1.0
> 3904, 2049, 29, 0, 0.919
> 3904, 2049, 29, 1, 0.919
> 3968, 0, 0, 0, 1.003
> 3968, 0, 0, 1, 1.003
> 3968, 30, 0, 0, 0.969
> 3968, 30, 0, 1, 0.969
> 3968, 62, 0, 0, 1.006
> 3968, 62, 0, 1, 1.006
> 3968, 0, 30, 0, 0.931
> 3968, 0, 30, 1, 0.93
> 3968, 0, 62, 0, 0.929
> 3968, 0, 62, 1, 0.929
> 3968, 30, 30, 0, 1.103
> 3968, 30, 30, 1, 1.103
> 3968, 62, 62, 0, 0.99
> 3968, 62, 62, 1, 0.99
> 3968, 2048, 0, 0, 1.004
> 3968, 2048, 0, 1, 1.004
> 3968, 2078, 0, 0, 0.969
> 3968, 2078, 0, 1, 0.969
> 3968, 2048, 30, 0, 0.899
> 3968, 2048, 30, 1, 0.899
> 3968, 2078, 30, 0, 1.105
> 3968, 2078, 30, 1, 1.105
> 3968, 30, 1, 0, 0.993
> 3968, 30, 1, 1, 0.993
> 3968, 1, 30, 0, 0.908
> 3968, 1, 30, 1, 0.908
> 3968, 62, 1, 0, 0.978
> 3968, 62, 1, 1, 0.978
> 3968, 1, 62, 0, 0.895
> 3968, 1, 62, 1, 0.895
> 3968, 2078, 1, 0, 0.993
> 3968, 2078, 1, 1, 0.993
> 3968, 2049, 30, 0, 0.904
> 3968, 2049, 30, 1, 0.904
> 4032, 0, 0, 0, 0.995
> 4032, 0, 0, 1, 0.995
> 4032, 31, 0, 0, 0.967
> 4032, 31, 0, 1, 0.967
> 4032, 63, 0, 0, 1.002
> 4032, 63, 0, 1, 1.002
> 4032, 0, 31, 0, 0.927
> 4032, 0, 31, 1, 0.926
> 4032, 0, 63, 0, 0.927
> 4032, 0, 63, 1, 0.927
> 4032, 31, 31, 0, 1.09
> 4032, 31, 31, 1, 1.09
> 4032, 63, 63, 0, 0.987
> 4032, 63, 63, 1, 0.987
> 4032, 2048, 0, 0, 0.995
> 4032, 2048, 0, 1, 0.995
> 4032, 2079, 0, 0, 0.967
> 4032, 2079, 0, 1, 0.967
> 4032, 2048, 31, 0, 0.897
> 4032, 2048, 31, 1, 0.897
> 4032, 2079, 31, 0, 1.09
> 4032, 2079, 31, 1, 1.09
> 4032, 31, 1, 0, 0.989
> 4032, 31, 1, 1, 0.989
> 4032, 1, 31, 0, 0.911
> 4032, 1, 31, 1, 0.911
> 4032, 63, 1, 0, 0.971
> 4032, 63, 1, 1, 0.972
> 4032, 1, 63, 0, 0.892
> 4032, 1, 63, 1, 0.892
> 4032, 2079, 1, 0, 0.989
> 4032, 2079, 1, 1, 0.989
> 4032, 2049, 31, 0, 0.907
> 4032, 2049, 31, 1, 0.907
> 4096, 32, 0, 0, 1.014
> 4096, 32, 0, 1, 1.014
> 4096, 64, 0, 0, 1.014
> 4096, 64, 0, 1, 1.014
> 4096, 0, 32, 0, 1.012
> 4096, 0, 32, 1, 1.012
> 4096, 0, 64, 0, 1.012
> 4096, 0, 64, 1, 1.012
> 4096, 32, 32, 0, 1.014
> 4096, 32, 32, 1, 1.014
> 4096, 64, 64, 0, 1.014
> 4096, 64, 64, 1, 1.014
> 4096, 2080, 0, 0, 1.014
> 4096, 2080, 0, 1, 1.014
> 4096, 2048, 32, 0, 1.014
> 4096, 2048, 32, 1, 1.014
> 4096, 2080, 32, 0, 1.014
> 4096, 2080, 32, 1, 1.014
> 4096, 32, 1, 0, 0.975
> 4096, 32, 1, 1, 0.975
> 4096, 1, 32, 0, 0.769
> 4096, 1, 32, 1, 0.769
> 4096, 64, 1, 0, 0.858
> 4096, 64, 1, 1, 0.858
> 4096, 1, 64, 0, 0.769
> 4096, 1, 64, 1, 0.769
> 4096, 2080, 1, 0, 0.829
> 4096, 2080, 1, 1, 0.829
> 4096, 2049, 32, 0, 0.886
> 4096, 2049, 32, 1, 0.886
> 4160, 0, 0, 0, 1.003
> 4160, 0, 0, 1, 1.003
> 4160, 33, 0, 0, 1.004
> 4160, 33, 0, 1, 1.004
> 4160, 65, 0, 0, 0.999
> 4160, 65, 0, 1, 0.999
> 4160, 0, 33, 0, 0.931
> 4160, 0, 33, 1, 0.931
> 4160, 0, 65, 0, 0.765
> 4160, 0, 65, 1, 0.765
> 4160, 33, 33, 0, 0.998
> 4160, 33, 33, 1, 0.998
> 4160, 65, 65, 0, 0.942
> 4160, 65, 65, 1, 0.942
> 4160, 2048, 0, 0, 1.003
> 4160, 2048, 0, 1, 1.003
> 4160, 2081, 0, 0, 1.004
> 4160, 2081, 0, 1, 1.004
> 4160, 2048, 33, 0, 0.899
> 4160, 2048, 33, 1, 0.898
> 4160, 2081, 33, 0, 1.002
> 4160, 2081, 33, 1, 1.002
> 4160, 33, 1, 0, 1.114
> 4160, 33, 1, 1, 1.114
> 4160, 1, 33, 0, 1.01
> 4160, 1, 33, 1, 1.01
> 4160, 65, 1, 0, 1.077
> 4160, 65, 1, 1, 1.077
> 4160, 1, 65, 0, 0.935
> 4160, 1, 65, 1, 0.935
> 4160, 2081, 1, 0, 1.077
> 4160, 2081, 1, 1, 1.077
> 4160, 2049, 33, 0, 1.007
> 4160, 2049, 33, 1, 1.007
> 4224, 0, 0, 0, 1.014
> 4224, 0, 0, 1, 1.014
> 4224, 34, 0, 0, 1.0
> 4224, 34, 0, 1, 1.0
> 4224, 66, 0, 0, 1.001
> 4224, 66, 0, 1, 1.001
> 4224, 0, 34, 0, 0.928
> 4224, 0, 34, 1, 0.928
> 4224, 0, 66, 0, 0.762
> 4224, 0, 66, 1, 0.762
> 4224, 34, 34, 0, 0.998
> 4224, 34, 34, 1, 0.998
> 4224, 66, 66, 0, 0.959
> 4224, 66, 66, 1, 0.959
> 4224, 2048, 0, 0, 1.014
> 4224, 2048, 0, 1, 1.014
> 4224, 2082, 0, 0, 1.001
> 4224, 2082, 0, 1, 1.001
> 4224, 2048, 34, 0, 0.899
> 4224, 2048, 34, 1, 0.898
> 4224, 2082, 34, 0, 0.998
> 4224, 2082, 34, 1, 0.998
> 4224, 34, 1, 0, 1.024
> 4224, 34, 1, 1, 1.023
> 4224, 1, 34, 0, 0.917
> 4224, 1, 34, 1, 0.917
> 4224, 66, 1, 0, 1.012
> 4224, 66, 1, 1, 1.013
> 4224, 1, 66, 0, 0.917
> 4224, 1, 66, 1, 0.917
> 4224, 2082, 1, 0, 1.022
> 4224, 2082, 1, 1, 1.022
> 4224, 2049, 34, 0, 0.914
> 4224, 2049, 34, 1, 0.914
> 4288, 0, 0, 0, 0.999
> 4288, 0, 0, 1, 0.999
> 4288, 35, 0, 0, 0.995
> 4288, 35, 0, 1, 0.996
> 4288, 67, 0, 0, 0.998
> 4288, 67, 0, 1, 0.998
> 4288, 0, 35, 0, 0.919
> 4288, 0, 35, 1, 0.918
> 4288, 0, 67, 0, 0.767
> 4288, 0, 67, 1, 0.767
> 4288, 35, 35, 0, 1.005
> 4288, 35, 35, 1, 1.004
> 4288, 67, 67, 0, 0.995
> 4288, 67, 67, 1, 0.995
> 4288, 2048, 0, 0, 0.999
> 4288, 2048, 0, 1, 0.999
> 4288, 2083, 0, 0, 0.995
> 4288, 2083, 0, 1, 0.995
> 4288, 2048, 35, 0, 0.905
> 4288, 2048, 35, 1, 0.904
> 4288, 2083, 35, 0, 1.005
> 4288, 2083, 35, 1, 1.004
> 4288, 35, 1, 0, 1.033
> 4288, 35, 1, 1, 1.032
> 4288, 1, 35, 0, 0.928
> 4288, 1, 35, 1, 0.928
> 4288, 67, 1, 0, 1.019
> 4288, 67, 1, 1, 1.02
> 4288, 1, 67, 0, 0.925
> 4288, 1, 67, 1, 0.924
> 4288, 2083, 1, 0, 1.03
> 4288, 2083, 1, 1, 1.03
> 4288, 2049, 35, 0, 0.925
> 4288, 2049, 35, 1, 0.926
> 4352, 0, 0, 0, 1.005
> 4352, 0, 0, 1, 1.005
> 4352, 36, 0, 0, 1.007
> 4352, 36, 0, 1, 1.006
> 4352, 68, 0, 0, 1.007
> 4352, 68, 0, 1, 1.008
> 4352, 0, 36, 0, 0.929
> 4352, 0, 36, 1, 0.929
> 4352, 0, 68, 0, 0.766
> 4352, 0, 68, 1, 0.766
> 4352, 36, 36, 0, 0.998
> 4352, 36, 36, 1, 0.998
> 4352, 68, 68, 0, 0.964
> 4352, 68, 68, 1, 0.964
> 4352, 2048, 0, 0, 1.006
> 4352, 2048, 0, 1, 1.006
> 4352, 2084, 0, 0, 1.006
> 4352, 2084, 0, 1, 1.006
> 4352, 2048, 36, 0, 0.897
> 4352, 2048, 36, 1, 0.898
> 4352, 2084, 36, 0, 0.998
> 4352, 2084, 36, 1, 0.998
> 4352, 36, 1, 0, 1.031
> 4352, 36, 1, 1, 1.031
> 4352, 1, 36, 0, 0.924
> 4352, 1, 36, 1, 0.924
> 4352, 68, 1, 0, 0.999
> 4352, 68, 1, 1, 0.999
> 4352, 1, 68, 0, 0.922
> 4352, 1, 68, 1, 0.922
> 4352, 2084, 1, 0, 1.03
> 4352, 2084, 1, 1, 1.03
> 4352, 2049, 36, 0, 0.922
> 4352, 2049, 36, 1, 0.922
> 4416, 0, 0, 0, 0.997
> 4416, 0, 0, 1, 0.997
> 4416, 37, 0, 0, 1.002
> 4416, 37, 0, 1, 1.002
> 4416, 69, 0, 0, 1.004
> 4416, 69, 0, 1, 1.004
> 4416, 0, 37, 0, 0.928
> 4416, 0, 37, 1, 0.927
> 4416, 0, 69, 0, 0.762
> 4416, 0, 69, 1, 0.762
> 4416, 37, 37, 0, 0.994
> 4416, 37, 37, 1, 0.994
> 4416, 69, 69, 0, 0.959
> 4416, 69, 69, 1, 0.959
> 4416, 2048, 0, 0, 0.997
> 4416, 2048, 0, 1, 0.997
> 4416, 2085, 0, 0, 1.001
> 4416, 2085, 0, 1, 1.001
> 4416, 2048, 37, 0, 0.899
> 4416, 2048, 37, 1, 0.899
> 4416, 2085, 37, 0, 0.994
> 4416, 2085, 37, 1, 0.994
> 4416, 37, 1, 0, 1.024
> 4416, 37, 1, 1, 1.023
> 4416, 1, 37, 0, 0.923
> 4416, 1, 37, 1, 0.922
> 4416, 69, 1, 0, 1.009
> 4416, 69, 1, 1, 1.01
> 4416, 1, 69, 0, 0.917
> 4416, 1, 69, 1, 0.917
> 4416, 2085, 1, 0, 1.024
> 4416, 2085, 1, 1, 1.024
> 4416, 2049, 37, 0, 0.919
> 4416, 2049, 37, 1, 0.919
> 4480, 0, 0, 0, 1.0
> 4480, 0, 0, 1, 0.999
> 4480, 38, 0, 0, 0.996
> 4480, 38, 0, 1, 0.996
> 4480, 70, 0, 0, 1.0
> 4480, 70, 0, 1, 1.0
> 4480, 0, 38, 0, 0.919
> 4480, 0, 38, 1, 0.921
> 4480, 0, 70, 0, 0.767
> 4480, 0, 70, 1, 0.767
> 4480, 38, 38, 0, 1.002
> 4480, 38, 38, 1, 1.002
> 4480, 70, 70, 0, 0.963
> 4480, 70, 70, 1, 0.963
> 4480, 2048, 0, 0, 0.998
> 4480, 2048, 0, 1, 0.999
> 4480, 2086, 0, 0, 0.996
> 4480, 2086, 0, 1, 0.995
> 4480, 2048, 38, 0, 0.907
> 4480, 2048, 38, 1, 0.907
> 4480, 2086, 38, 0, 1.002
> 4480, 2086, 38, 1, 1.002
> 4480, 38, 1, 0, 1.032
> 4480, 38, 1, 1, 1.031
> 4480, 1, 38, 0, 0.919
> 4480, 1, 38, 1, 0.92
> 4480, 70, 1, 0, 1.018
> 4480, 70, 1, 1, 1.017
> 4480, 1, 70, 0, 0.916
> 4480, 1, 70, 1, 0.915
> 4480, 2086, 1, 0, 1.031
> 4480, 2086, 1, 1, 1.03
> 4480, 2049, 38, 0, 0.917
> 4480, 2049, 38, 1, 0.918
> 4544, 0, 0, 0, 1.002
> 4544, 0, 0, 1, 1.002
> 4544, 39, 0, 0, 1.007
> 4544, 39, 0, 1, 1.008
> 4544, 71, 0, 0, 1.002
> 4544, 71, 0, 1, 1.002
> 4544, 0, 39, 0, 0.93
> 4544, 0, 39, 1, 0.931
> 4544, 0, 71, 0, 0.766
> 4544, 0, 71, 1, 0.766
> 4544, 39, 39, 0, 1.001
> 4544, 39, 39, 1, 1.001
> 4544, 71, 71, 0, 0.966
> 4544, 71, 71, 1, 0.966
> 4544, 2048, 0, 0, 1.002
> 4544, 2048, 0, 1, 1.002
> 4544, 2087, 0, 0, 1.008
> 4544, 2087, 0, 1, 1.007
> 4544, 2048, 39, 0, 0.901
> 4544, 2048, 39, 1, 0.901
> 4544, 2087, 39, 0, 1.001
> 4544, 2087, 39, 1, 1.001
> 4544, 39, 1, 0, 1.025
> 4544, 39, 1, 1, 1.025
> 4544, 1, 39, 0, 0.919
> 4544, 1, 39, 1, 0.919
> 4544, 71, 1, 0, 0.991
> 4544, 71, 1, 1, 0.991
> 4544, 1, 71, 0, 0.921
> 4544, 1, 71, 1, 0.922
> 4544, 2087, 1, 0, 1.025
> 4544, 2087, 1, 1, 1.025
> 4544, 2049, 39, 0, 0.917
> 4544, 2049, 39, 1, 0.917
> 4608, 0, 0, 0, 0.997
> 4608, 0, 0, 1, 0.997
> 4608, 40, 0, 0, 1.013
> 4608, 40, 0, 1, 1.013
> 4608, 72, 0, 0, 1.013
> 4608, 72, 0, 1, 1.013
> 4608, 0, 40, 0, 0.925
> 4608, 0, 40, 1, 0.926
> 4608, 0, 72, 0, 0.765
> 4608, 0, 72, 1, 0.765
> 4608, 40, 40, 0, 1.084
> 4608, 40, 40, 1, 1.084
> 4608, 72, 72, 0, 0.966
> 4608, 72, 72, 1, 0.966
> 4608, 2048, 0, 0, 0.999
> 4608, 2048, 0, 1, 0.999
> 4608, 2088, 0, 0, 1.012
> 4608, 2088, 0, 1, 1.012
> 4608, 2048, 40, 0, 0.898
> 4608, 2048, 40, 1, 0.898
> 4608, 2088, 40, 0, 1.087
> 4608, 2088, 40, 1, 1.087
> 4608, 40, 1, 0, 1.006
> 4608, 40, 1, 1, 1.006
> 4608, 1, 40, 0, 0.926
> 4608, 1, 40, 1, 0.925
> 4608, 72, 1, 0, 1.012
> 4608, 72, 1, 1, 1.011
> 4608, 1, 72, 0, 0.92
> 4608, 1, 72, 1, 0.92
> 4608, 2088, 1, 0, 1.006
> 4608, 2088, 1, 1, 1.006
> 4608, 2049, 40, 0, 0.923
> 4608, 2049, 40, 1, 0.923
> 4672, 0, 0, 0, 1.014
> 4672, 0, 0, 1, 1.014
> 4672, 41, 0, 0, 1.003
> 4672, 41, 0, 1, 1.003
> 4672, 73, 0, 0, 0.983
> 4672, 73, 0, 1, 0.982
> 4672, 0, 41, 0, 0.916
> 4672, 0, 41, 1, 0.918
> 4672, 0, 73, 0, 0.772
> 4672, 0, 73, 1, 0.772
> 4672, 41, 41, 0, 1.012
> 4672, 41, 41, 1, 1.012
> 4672, 73, 73, 0, 0.973
> 4672, 73, 73, 1, 0.973
> 4672, 2048, 0, 0, 1.014
> 4672, 2048, 0, 1, 1.014
> 4672, 2089, 0, 0, 1.002
> 4672, 2089, 0, 1, 1.002
> 4672, 2048, 41, 0, 0.907
> 4672, 2048, 41, 1, 0.908
> 4672, 2089, 41, 0, 1.012
> 4672, 2089, 41, 1, 1.012
> 4672, 41, 1, 0, 1.027
> 4672, 41, 1, 1, 1.027
> 4672, 1, 41, 0, 0.928
> 4672, 1, 41, 1, 0.927
> 4672, 73, 1, 0, 1.032
> 4672, 73, 1, 1, 1.03
> 4672, 1, 73, 0, 0.927
> 4672, 1, 73, 1, 0.927
> 4672, 2089, 1, 0, 1.026
> 4672, 2089, 1, 1, 1.027
> 4672, 2049, 41, 0, 0.925
> 4672, 2049, 41, 1, 0.925
> 4736, 0, 0, 0, 1.005
> 4736, 0, 0, 1, 1.005
> 4736, 42, 0, 0, 1.012
> 4736, 42, 0, 1, 1.012
> 4736, 74, 0, 0, 0.976
> 4736, 74, 0, 1, 0.975
> 4736, 0, 42, 0, 0.93
> 4736, 0, 42, 1, 0.93
> 4736, 0, 74, 0, 0.77
> 4736, 0, 74, 1, 0.77
> 4736, 42, 42, 0, 1.007
> 4736, 42, 42, 1, 1.007
> 4736, 74, 74, 0, 0.965
> 4736, 74, 74, 1, 0.965
> 4736, 2048, 0, 0, 1.006
> 4736, 2048, 0, 1, 1.006
> 4736, 2090, 0, 0, 1.013
> 4736, 2090, 0, 1, 1.013
> 4736, 2048, 42, 0, 0.902
> 4736, 2048, 42, 1, 0.902
> 4736, 2090, 42, 0, 1.007
> 4736, 2090, 42, 1, 1.007
> 4736, 42, 1, 0, 1.032
> 4736, 42, 1, 1, 1.032
> 4736, 1, 42, 0, 0.925
> 4736, 1, 42, 1, 0.925
> 4736, 74, 1, 0, 1.018
> 4736, 74, 1, 1, 1.018
> 4736, 1, 74, 0, 0.912
> 4736, 1, 74, 1, 0.912
> 4736, 2090, 1, 0, 1.032
> 4736, 2090, 1, 1, 1.032
> 4736, 2049, 42, 0, 0.923
> 4736, 2049, 42, 1, 0.923
> 4800, 0, 0, 0, 1.012
> 4800, 0, 0, 1, 1.012
> 4800, 43, 0, 0, 1.008
> 4800, 43, 0, 1, 1.008
> 4800, 75, 0, 0, 0.99
> 4800, 75, 0, 1, 0.99
> 4800, 0, 43, 0, 0.928
> 4800, 0, 43, 1, 0.928
> 4800, 0, 75, 0, 0.767
> 4800, 0, 75, 1, 0.768
> 4800, 43, 43, 0, 1.004
> 4800, 43, 43, 1, 1.004
> 4800, 75, 75, 0, 0.965
> 4800, 75, 75, 1, 0.965
> 4800, 2048, 0, 0, 1.012
> 4800, 2048, 0, 1, 1.012
> 4800, 2091, 0, 0, 1.009
> 4800, 2091, 0, 1, 1.008
> 4800, 2048, 43, 0, 0.902
> 4800, 2048, 43, 1, 0.902
> 4800, 2091, 43, 0, 1.004
> 4800, 2091, 43, 1, 1.004
> 4800, 43, 1, 0, 1.026
> 4800, 43, 1, 1, 1.025
> 4800, 1, 43, 0, 0.91
> 4800, 1, 43, 1, 0.91
> 4800, 75, 1, 0, 0.992
> 4800, 75, 1, 1, 0.992
> 4800, 1, 75, 0, 0.921
> 4800, 1, 75, 1, 0.92
> 4800, 2091, 1, 0, 1.025
> 4800, 2091, 1, 1, 1.025
> 4800, 2049, 43, 0, 0.907
> 4800, 2049, 43, 1, 0.907
> 4864, 0, 0, 0, 0.998
> 4864, 0, 0, 1, 0.998
> 4864, 44, 0, 0, 1.003
> 4864, 44, 0, 1, 1.004
> 4864, 76, 0, 0, 0.987
> 4864, 76, 0, 1, 0.987
> 4864, 0, 44, 0, 0.92
> 4864, 0, 44, 1, 0.921
> 4864, 0, 76, 0, 0.933
> 4864, 0, 76, 1, 0.932
> 4864, 44, 44, 0, 1.006
> 4864, 44, 44, 1, 1.004
> 4864, 76, 76, 0, 0.976
> 4864, 76, 76, 1, 0.975
> 4864, 2048, 0, 0, 0.999
> 4864, 2048, 0, 1, 0.999
> 4864, 2092, 0, 0, 1.004
> 4864, 2092, 0, 1, 1.005
> 4864, 2048, 44, 0, 0.907
> 4864, 2048, 44, 1, 0.907
> 4864, 2092, 44, 0, 1.006
> 4864, 2092, 44, 1, 1.005
> 4864, 44, 1, 0, 1.034
> 4864, 44, 1, 1, 1.032
> 4864, 1, 44, 0, 0.908
> 4864, 1, 44, 1, 0.929
> 4864, 76, 1, 0, 1.006
> 4864, 76, 1, 1, 1.005
> 4864, 1, 76, 0, 0.798
> 4864, 1, 76, 1, 0.798
> 4864, 2092, 1, 0, 1.033
> 4864, 2092, 1, 1, 1.033
> 4864, 2049, 44, 0, 0.904
> 4864, 2049, 44, 1, 0.925
> 4928, 0, 0, 0, 1.005
> 4928, 0, 0, 1, 1.005
> 4928, 45, 0, 0, 0.993
> 4928, 45, 0, 1, 1.012
> 4928, 77, 0, 0, 0.956
> 4928, 77, 0, 1, 0.976
> 4928, 0, 45, 0, 0.933
> 4928, 0, 45, 1, 0.932
> 4928, 0, 77, 0, 0.771
> 4928, 0, 77, 1, 0.771
> 4928, 45, 45, 0, 1.015
> 4928, 45, 45, 1, 1.015
> 4928, 77, 77, 0, 0.972
> 4928, 77, 77, 1, 0.972
> 4928, 2048, 0, 0, 1.005
> 4928, 2048, 0, 1, 1.005
> 4928, 2093, 0, 0, 0.992
> 4928, 2093, 0, 1, 1.012
> 4928, 2048, 45, 0, 0.932
> 4928, 2048, 45, 1, 0.931
> 4928, 2093, 45, 0, 1.015
> 4928, 2093, 45, 1, 1.015
> 4928, 45, 1, 0, 1.009
> 4928, 45, 1, 1, 1.032
> 4928, 1, 45, 0, 0.806
> 4928, 1, 45, 1, 0.805
> 4928, 77, 1, 0, 0.981
> 4928, 77, 1, 1, 1.005
> 4928, 1, 77, 0, 0.917
> 4928, 1, 77, 1, 0.917
> 4928, 2093, 1, 0, 1.008
> 4928, 2093, 1, 1, 1.032
> 4928, 2049, 45, 0, 0.794
> 4928, 2049, 45, 1, 0.794
> 4992, 0, 0, 0, 0.999
> 4992, 0, 0, 1, 0.999
> 4992, 46, 0, 0, 0.985
> 4992, 46, 0, 1, 1.008
> 4992, 78, 0, 0, 0.963
> 4992, 78, 0, 1, 0.984
> 4992, 0, 46, 0, 0.908
> 4992, 0, 46, 1, 0.908
> 4992, 0, 78, 0, 0.752
> 4992, 0, 78, 1, 0.751
> 4992, 46, 46, 0, 0.997
> 4992, 46, 46, 1, 0.997
> 4992, 78, 78, 0, 0.969
> 4992, 78, 78, 1, 0.968
> 4992, 2048, 0, 0, 1.0
> 4992, 2048, 0, 1, 1.0
> 4992, 2094, 0, 0, 0.987
> 4992, 2094, 0, 1, 1.008
> 4992, 2048, 46, 0, 0.883
> 4992, 2048, 46, 1, 0.883
> 4992, 2094, 46, 0, 0.997
> 4992, 2094, 46, 1, 0.997
> 4992, 46, 1, 0, 0.998
> 4992, 46, 1, 1, 1.02
> 4992, 1, 46, 0, 0.917
> 4992, 1, 46, 1, 0.917
> 4992, 78, 1, 0, 0.972
> 4992, 78, 1, 1, 0.993
> 4992, 1, 78, 0, 0.919
> 4992, 1, 78, 1, 0.92
> 4992, 2094, 1, 0, 0.997
> 4992, 2094, 1, 1, 1.019
> 4992, 2049, 46, 0, 0.914
> 4992, 2049, 46, 1, 0.914
> 5056, 0, 0, 0, 1.002
> 5056, 0, 0, 1, 1.0
> 5056, 47, 0, 0, 1.005
> 5056, 47, 0, 1, 1.005
> 5056, 79, 0, 0, 0.989
> 5056, 79, 0, 1, 0.989
> 5056, 0, 47, 0, 0.918
> 5056, 0, 47, 1, 0.919
> 5056, 0, 79, 0, 0.772
> 5056, 0, 79, 1, 0.771
> 5056, 47, 47, 0, 1.006
> 5056, 47, 47, 1, 1.006
> 5056, 79, 79, 0, 0.972
> 5056, 79, 79, 1, 0.972
> 5056, 2048, 0, 0, 1.001
> 5056, 2048, 0, 1, 1.0
> 5056, 2095, 0, 0, 1.004
> 5056, 2095, 0, 1, 1.004
> 5056, 2048, 47, 0, 0.908
> 5056, 2048, 47, 1, 0.909
> 5056, 2095, 47, 0, 1.006
> 5056, 2095, 47, 1, 1.006
> 5056, 47, 1, 0, 1.033
> 5056, 47, 1, 1, 1.033
> 5056, 1, 47, 0, 0.919
> 5056, 1, 47, 1, 0.919
> 5056, 79, 1, 0, 1.003
> 5056, 79, 1, 1, 1.005
> 5056, 1, 79, 0, 0.921
> 5056, 1, 79, 1, 0.921
> 5056, 2095, 1, 0, 1.032
> 5056, 2095, 1, 1, 1.034
> 5056, 2049, 47, 0, 0.918
> 5056, 2049, 47, 1, 0.917
> 5120, 0, 0, 0, 1.003
> 5120, 0, 0, 1, 1.003
> 5120, 48, 0, 0, 1.068
> 5120, 48, 0, 1, 1.068
> 5120, 80, 0, 0, 1.068
> 5120, 80, 0, 1, 1.068
> 5120, 0, 48, 0, 1.065
> 5120, 0, 48, 1, 1.065
> 5120, 0, 80, 0, 1.064
> 5120, 0, 80, 1, 1.065
> 5120, 48, 48, 0, 1.004
> 5120, 48, 48, 1, 1.004
> 5120, 80, 80, 0, 1.005
> 5120, 80, 80, 1, 1.005
> 5120, 2048, 0, 0, 1.005
> 5120, 2048, 0, 1, 1.005
> 5120, 2096, 0, 0, 1.068
> 5120, 2096, 0, 1, 1.068
> 5120, 2048, 48, 0, 1.065
> 5120, 2048, 48, 1, 1.065
> 5120, 2096, 48, 0, 1.005
> 5120, 2096, 48, 1, 1.005
> 5120, 48, 1, 0, 1.033
> 5120, 48, 1, 1, 1.031
> 5120, 1, 48, 0, 0.898
> 5120, 1, 48, 1, 0.898
> 5120, 80, 1, 0, 0.844
> 5120, 80, 1, 1, 0.844
> 5120, 1, 80, 0, 0.898
> 5120, 1, 80, 1, 0.898
> 5120, 2096, 1, 0, 0.856
> 5120, 2096, 1, 1, 0.855
> 5120, 2049, 48, 0, 0.898
> 5120, 2049, 48, 1, 0.898
>
> bench-memcpy-random:
>
> length, New Time / Old Time
> 32768, 0.866
> 65536, 0.891
> 131072, 0.896
> 262144, 0.901
> 524288, 0.904
> 1048576, 0.913
>
> bench-memcpy-large:
>
> length, align0, align1, dst > src, New Time / Old Time
> 65543, 0, 0, 0, 0.981
> 65543, 0, 0, 1, 0.981
> 65551, 0, 3, 0, 1.012
> 65551, 0, 3, 1, 1.013
> 65567, 3, 0, 0, 1.019
> 65567, 3, 0, 1, 1.02
> 65599, 3, 5, 0, 1.058
> 65599, 3, 5, 1, 1.061
> 65536, 0, 127, 0, 1.046
> 65536, 0, 127, 1, 1.046
> 65536, 0, 255, 0, 1.071
> 65536, 0, 255, 1, 1.071
> 65536, 0, 256, 0, 0.983
> 65536, 0, 256, 1, 0.984
> 65536, 0, 4064, 0, 1.017
> 65536, 0, 4064, 1, 1.018
> 131079, 0, 0, 0, 0.981
> 131079, 0, 0, 1, 0.981
> 131087, 0, 3, 0, 1.017
> 131087, 0, 3, 1, 1.017
> 131103, 3, 0, 0, 1.022
> 131103, 3, 0, 1, 1.022
> 131135, 3, 5, 0, 1.064
> 131135, 3, 5, 1, 1.065
> 131072, 0, 127, 0, 1.05
> 131072, 0, 127, 1, 1.05
> 131072, 0, 255, 0, 1.074
> 131072, 0, 255, 1, 1.074
> 131072, 0, 256, 0, 0.984
> 131072, 0, 256, 1, 0.984
> 131072, 0, 4064, 0, 1.018
> 131072, 0, 4064, 1, 1.019
> 262151, 0, 0, 0, 0.985
> 262151, 0, 0, 1, 0.985
> 262159, 0, 3, 0, 1.026
> 262159, 0, 3, 1, 1.026
> 262175, 3, 0, 0, 1.03
> 262175, 3, 0, 1, 1.03
> 262207, 3, 5, 0, 1.07
> 262207, 3, 5, 1, 1.07
> 262144, 0, 127, 0, 1.057
> 262144, 0, 127, 1, 1.057
> 262144, 0, 255, 0, 1.079
> 262144, 0, 255, 1, 1.078
> 262144, 0, 256, 0, 0.988
> 262144, 0, 256, 1, 0.988
> 262144, 0, 4064, 0, 1.02
> 262144, 0, 4064, 1, 1.02
> 524295, 0, 0, 0, 0.692
> 524295, 0, 0, 1, 0.692
> 524303, 0, 3, 0, 0.736
> 524303, 0, 3, 1, 0.737
> 524319, 3, 0, 0, 0.758
> 524319, 3, 0, 1, 0.759
> 524351, 3, 5, 0, 0.759
> 524351, 3, 5, 1, 0.759
> 524288, 0, 127, 0, 1.057
> 524288, 0, 127, 1, 1.058
> 524288, 0, 255, 0, 1.079
> 524288, 0, 255, 1, 1.079
> 524288, 0, 256, 0, 0.988
> 524288, 0, 256, 1, 0.988
> 524288, 0, 4064, 0, 1.02
> 524288, 0, 4064, 1, 1.02
> 1048583, 0, 0, 0, 0.948
> 1048583, 0, 0, 1, 0.948
> 1048591, 0, 3, 0, 0.735
> 1048591, 0, 3, 1, 0.735
> 1048607, 3, 0, 0, 0.757
> 1048607, 3, 0, 1, 0.758
> 1048639, 3, 5, 0, 0.758
> 1048639, 3, 5, 1, 0.758
> 1048576, 0, 127, 0, 0.761
> 1048576, 0, 127, 1, 0.762
> 1048576, 0, 255, 0, 0.751
> 1048576, 0, 255, 1, 0.751
> 1048576, 0, 256, 0, 0.93
> 1048576, 0, 256, 1, 0.93
> 1048576, 0, 4064, 0, 0.93
> 1048576, 0, 4064, 1, 0.93
> 2097159, 0, 0, 0, 0.928
> 2097159, 0, 0, 1, 0.931
> 2097167, 0, 3, 0, 0.735
> 2097167, 0, 3, 1, 0.734
> 2097183, 3, 0, 0, 0.759
> 2097183, 3, 0, 1, 0.759
> 2097215, 3, 5, 0, 0.758
> 2097215, 3, 5, 1, 0.757
> 2097152, 0, 127, 0, 0.77
> 2097152, 0, 127, 1, 0.77
> 2097152, 0, 255, 0, 0.745
> 2097152, 0, 255, 1, 0.745
> 2097152, 0, 256, 0, 0.924
> 2097152, 0, 256, 1, 0.925
> 2097152, 0, 4064, 0, 0.926
> 2097152, 0, 4064, 1, 0.927
> 4194311, 0, 0, 0, 0.894
> 4194311, 0, 0, 1, 0.896
> 4194319, 0, 3, 0, 0.752
> 4194319, 0, 3, 1, 0.751
> 4194335, 3, 0, 0, 0.82
> 4194335, 3, 0, 1, 0.821
> 4194367, 3, 5, 0, 0.788
> 4194367, 3, 5, 1, 0.789
> 4194304, 0, 127, 0, 0.801
> 4194304, 0, 127, 1, 0.801
> 4194304, 0, 255, 0, 0.802
> 4194304, 0, 255, 1, 0.804
> 4194304, 0, 256, 0, 0.873
> 4194304, 0, 256, 1, 0.868
> 4194304, 0, 4064, 0, 0.955
> 4194304, 0, 4064, 1, 0.954
> 8388615, 0, 0, 0, 0.885
> 8388615, 0, 0, 1, 0.886
> 8388623, 0, 3, 0, 0.769
> 8388623, 0, 3, 1, 0.769
> 8388639, 3, 0, 0, 0.87
> 8388639, 3, 0, 1, 0.87
> 8388671, 3, 5, 0, 0.811
> 8388671, 3, 5, 1, 0.814
> 8388608, 0, 127, 0, 0.83
> 8388608, 0, 127, 1, 0.83
> 8388608, 0, 255, 0, 0.857
> 8388608, 0, 255, 1, 0.857
> 8388608, 0, 256, 0, 0.851
> 8388608, 0, 256, 1, 0.848
> 8388608, 0, 4064, 0, 0.981
> 8388608, 0, 4064, 1, 0.981
> 16777223, 0, 0, 0, 0.885
> 16777223, 0, 0, 1, 0.886
> 16777231, 0, 3, 0, 0.769
> 16777231, 0, 3, 1, 0.768
> 16777247, 3, 0, 0, 0.87
> 16777247, 3, 0, 1, 0.87
> 16777279, 3, 5, 0, 0.811
> 16777279, 3, 5, 1, 0.814
> 16777216, 0, 127, 0, 0.831
> 16777216, 0, 127, 1, 0.83
> 16777216, 0, 255, 0, 0.857
> 16777216, 0, 255, 1, 0.857
> 16777216, 0, 256, 0, 0.852
> 16777216, 0, 256, 1, 0.848
> 16777216, 0, 4064, 0, 0.98
> 16777216, 0, 4064, 1, 0.981
> 33554439, 0, 0, 0, 0.885
> 33554439, 0, 0, 1, 0.886
> 33554447, 0, 3, 0, 0.768
> 33554447, 0, 3, 1, 0.768
> 33554463, 3, 0, 0, 0.871
> 33554463, 3, 0, 1, 0.87
> 33554495, 3, 5, 0, 0.811
> 33554495, 3, 5, 1, 0.814
> 33554432, 0, 127, 0, 0.831
> 33554432, 0, 127, 1, 0.831
> 33554432, 0, 255, 0, 0.858
> 33554432, 0, 255, 1, 0.857
> 33554432, 0, 256, 0, 0.852
> 33554432, 0, 256, 1, 0.848
> 33554432, 0, 4064, 0, 0.98
> 33554432, 0, 4064, 1, 0.981
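
(Aside, not part of the quoted patch: every table above reports a
"New Time / Old Time" ratio per configuration, so values below 1.0
favor the new implementation.  A throwaway helper along these lines
-- the name and usage are purely illustrative -- collapses one of
these dumps into a single geometric mean, which is the usual way to
summarize ratio data.  Build with e.g. "gcc -O2 geomean.c -lm" and
run "./a.out < bench-memcpy-large.txt":

#include <math.h>
#include <stdio.h>
#include <string.h>

int
main (void)
{
  char line[256];
  double log_sum = 0.0;
  long rows = 0;

  while (fgets (line, sizeof line, stdin) != NULL)
    {
      /* The ratio is the last comma-separated field on a data row.  */
      char *last = strrchr (line, ',');
      double ratio;
      if (last == NULL || sscanf (last + 1, "%lf", &ratio) != 1
          || ratio <= 0.0)
        continue;  /* Header, blank, or non-data line.  */
      log_sum += log (ratio);
      rows++;
    }

  if (rows > 0)
    printf ("geomean of %ld rows: %.3f\n", rows, exp (log_sum / rows));
  return 0;
}
)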
>
>
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
> sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
> sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
> sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
> sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
> 6 files changed, 3572 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
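(One note before the bulk deletion below: the core trick the SSSE3
strcpy implementation -- and its SSE2/AVX2 successors -- builds on is
visible in the deleted code's "convert byte mask in xmm0 to bit mask"
comment: pcmpeqb compares 16 source bytes against zero to produce a
byte mask, and pmovmskb compresses that into a bit mask whose lowest
set bit locates the terminating NUL.  A minimal C-intrinsics sketch of
that one step, illustrative only and not taken from the patch:

#include <emmintrin.h>  /* SSE2; SSSE3 only adds palignr on top.  */

/* Return the index of the first NUL byte in a 16-byte-aligned block,
   or -1 if there is none.  Mirrors the asm's pcmpeqb/pmovmskb pair.  */
static inline int
first_nul_index (const char *p)
{
  __m128i chunk = _mm_load_si128 ((const __m128i *) p); /* aligned load */
  __m128i eq = _mm_cmpeq_epi8 (chunk, _mm_setzero_si128 ()); /* pcmpeqb */
  int mask = _mm_movemask_epi8 (eq);                    /* pmovmskb */
  return mask != 0 ? __builtin_ctz (mask) : -1;
}

The SSSE3-specific part being removed is the palignr machinery (the
L(Shl1)..L(Shl15) paths), which realigns the source stream to the
destination's alignment one of sixteen ways; that is exactly the code
size the commit message argues is no longer worth carrying.)
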
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 2b3c625ea2..5b02ec8de5 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -46,13 +46,11 @@ sysdep_routines += \
> stpcpy-evex \
> stpcpy-sse2 \
> stpcpy-sse2-unaligned \
> - stpcpy-ssse3 \
> stpncpy-avx2 \
> stpncpy-avx2-rtm \
> stpncpy-c \
> stpncpy-evex \
> stpncpy-sse2-unaligned \
> - stpncpy-ssse3 \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> strcasecmp_l-evex \
> @@ -83,7 +81,6 @@ sysdep_routines += \
> strcpy-evex \
> strcpy-sse2 \
> strcpy-sse2-unaligned \
> - strcpy-ssse3 \
> strcspn-c \
> strcspn-sse2 \
> strlen-avx2 \
> @@ -110,7 +107,6 @@ sysdep_routines += \
> strncpy-c \
> strncpy-evex \
> strncpy-sse2-unaligned \
> - strncpy-ssse3 \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 41a04621ad..49ce6860d0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
> __stpncpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpcpy.c. */
> IFUNC_IMPL (i, name, stpcpy,
> - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
> __stpcpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcpy_evex)
> - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> - __strcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncpy_evex)
> - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> - __strncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strncpy, 1,
> __strncpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_ssse3
> -# endif
> -
> - .section .text.ssse3,"ax",@progbits
> -ENTRY (STRCPY)
> -
> - mov %rsi, %rcx
> -# ifdef USE_AS_STRNCPY
> - mov %RDX_LP, %R8_LP
> -# endif
> - mov %rdi, %rdx
> -# ifdef USE_AS_STRNCPY
> - test %R8_LP, %R8_LP
> - jz L(Exit0)
> - cmp $8, %R8_LP
> - jbe L(StrncpyExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - jb L(StrncpyExit15Bytes)
> -# endif
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - je L(Exit16)
> -# endif
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - mov %rcx, %rsi
> - sub $16, %r8
> - and $0xf, %rsi
> -
> -/* add 16 bytes rcx_offset to r8 */
> -
> - add %rsi, %r8
> -# endif
> - lea 16(%rcx), %rsi
> - and $-16, %rsi
> - pxor %xmm0, %xmm0
> - mov (%rcx), %r9
> - mov %r9, (%rdx)
> - pcmpeqb (%rsi), %xmm0
> - mov 8(%rcx), %r9
> - mov %r9, 8(%rdx)
> -
> -/* convert byte mask in xmm0 to bit mask */
> -
> - pmovmskb %xmm0, %rax
> - sub %rcx, %rsi
> -
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - mov %rdx, %rax
> - lea 16(%rdx), %rdx
> - and $-16, %rdx
> - sub %rdx, %rax
> -
> -# ifdef USE_AS_STRNCPY
> - add %rax, %rsi
> - lea -1(%rsi), %rsi
> - and $1<<31, %esi
> - test %rsi, %rsi
> - jnz L(ContinueCopy)
> - lea 16(%r8), %r8
> -
> -L(ContinueCopy):
> -# endif
> - sub %rax, %rcx
> - mov %rcx, %rax
> - and $0xf, %rax
> - mov $0, %rsi
> -
> -/* case: rcx_offset == rdx_offset */
> -
> - jz L(Align16Both)
> -
> - cmp $8, %rax
> - jae L(ShlHigh8)
> - cmp $1, %rax
> - je L(Shl1)
> - cmp $2, %rax
> - je L(Shl2)
> - cmp $3, %rax
> - je L(Shl3)
> - cmp $4, %rax
> - je L(Shl4)
> - cmp $5, %rax
> - je L(Shl5)
> - cmp $6, %rax
> - je L(Shl6)
> - jmp L(Shl7)
> -
> -L(ShlHigh8):
> - je L(Shl8)
> - cmp $9, %rax
> - je L(Shl9)
> - cmp $10, %rax
> - je L(Shl10)
> - cmp $11, %rax
> - je L(Shl11)
> - cmp $12, %rax
> - je L(Shl12)
> - cmp $13, %rax
> - je L(Shl13)
> - cmp $14, %rax
> - je L(Shl14)
> - jmp L(Shl15)
> -
> -L(Align16Both):
> - movaps (%rcx), %xmm1
> - movaps 16(%rcx), %xmm2
> - movaps %xmm1, (%rdx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm4
> - movaps %xmm3, (%rdx, %rsi)
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm1
> - movaps %xmm4, (%rdx, %rsi)
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm2
> - movaps %xmm1, (%rdx, %rsi)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm3, (%rdx, %rsi)
> - mov %rcx, %rax
> - lea 16(%rcx, %rsi), %rcx
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - lea 112(%r8, %rax), %r8
> -# endif
> - mov $-0x40, %rsi
> -
> - .p2align 4
> -L(Aligned64Loop):
> - movaps (%rcx), %xmm2
> - movaps %xmm2, %xmm4
> - movaps 16(%rcx), %xmm5
> - movaps 32(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 48(%rcx), %xmm7
> - pminub %xmm5, %xmm2
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rax
> - lea 64(%rdx), %rdx
> - lea 64(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeaveCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Aligned64Leave)
> - movaps %xmm4, -64(%rdx)
> - movaps %xmm5, -48(%rdx)
> - movaps %xmm6, -32(%rdx)
> - movaps %xmm7, -16(%rdx)
> - jmp L(Aligned64Loop)
> -
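L(Aligned64Loop) above scans 64 bytes per iteration for the terminator
with three pminub plus a single pcmpeqb: the unsigned byte-wise minimum
of four chunks contains a zero byte exactly when one of the chunks
does.  As intrinsics (sketch only; the name is invented):

#include <emmintrin.h>

/* Nonzero iff the 64 aligned bytes at P contain a NUL.  */
static inline int
has_nul_in_64 (const char *p)
{
  __m128i a = _mm_load_si128 ((const __m128i *) p);
  __m128i b = _mm_load_si128 ((const __m128i *) (p + 16));
  __m128i c = _mm_load_si128 ((const __m128i *) (p + 32));
  __m128i d = _mm_load_si128 ((const __m128i *) (p + 48));
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (a, b), _mm_min_epu8 (c, d));
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ()));
}

On a hit, L(Aligned64Leave) re-tests the four chunks individually to
find which one holds the NUL.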
> -L(Aligned64Leave):
> -# ifdef USE_AS_STRNCPY
> - lea 48(%r8), %r8
> -# endif
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm6, -32(%rdx)
> - pcmpeqb %xmm7, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl1):
> - movaps -1(%rcx), %xmm1
> - movaps 15(%rcx), %xmm2
> -L(Shl1Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 31(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -15(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -1(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl1LoopStart):
> - movaps 15(%rcx), %xmm2
> - movaps 31(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 47(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 63(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $1, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $1, %xmm3, %xmm4
> - jnz L(Shl1Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave1)
> -# endif
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl1LoopStart)
> -
> -L(Shl1LoopExit):
> - movdqu -1(%rcx), %xmm1
> - mov $15, %rsi
> - movdqu %xmm1, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
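Each L(ShlN) variant exists so the copy loop can use only 16-byte-
aligned loads: palignr splices two adjacent aligned chunks into the
unaligned window, and because palignr takes its shift count as an
immediate, each of the 15 possible shifts needs its own copy of the
loop.  One step of the N = 1 case above is roughly (sketch; the name is
invented):

#include <tmmintrin.h>	/* palignr is SSSE3.  */

/* The 16 bytes starting at ALIGNED_P + 1, built from two aligned
   loads, as in L(Shl1): movaps + movaps + palignr $1.  */
static inline __m128i
load_at_plus_1 (const char *aligned_p)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) aligned_p);
  __m128i hi = _mm_load_si128 ((const __m128i *) (aligned_p + 16));
  return _mm_alignr_epi8 (hi, lo, 1);
}

palignr is the one SSSE3 instruction this file depends on, and the
per-shift unrolling it forces is where most of the code size goes.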
> - .p2align 4
> -L(Shl2):
> - movaps -2(%rcx), %xmm1
> - movaps 14(%rcx), %xmm2
> -L(Shl2Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 30(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -14(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -2(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl2LoopStart):
> - movaps 14(%rcx), %xmm2
> - movaps 30(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 46(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 62(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $2, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $2, %xmm3, %xmm4
> - jnz L(Shl2Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave2)
> -# endif
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl2LoopStart)
> -
> -L(Shl2LoopExit):
> - movdqu -2(%rcx), %xmm1
> - mov $14, %rsi
> - movdqu %xmm1, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl3):
> - movaps -3(%rcx), %xmm1
> - movaps 13(%rcx), %xmm2
> -L(Shl3Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 29(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -13(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -3(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl3LoopStart):
> - movaps 13(%rcx), %xmm2
> - movaps 29(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 45(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 61(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $3, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $3, %xmm3, %xmm4
> - jnz L(Shl3Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave3)
> -# endif
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl3LoopStart)
> -
> -L(Shl3LoopExit):
> - movdqu -3(%rcx), %xmm1
> - mov $13, %rsi
> - movdqu %xmm1, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl4):
> - movaps -4(%rcx), %xmm1
> - movaps 12(%rcx), %xmm2
> -L(Shl4Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 28(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -12(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -4(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl4LoopStart):
> - movaps 12(%rcx), %xmm2
> - movaps 28(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 44(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 60(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $4, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $4, %xmm3, %xmm4
> - jnz L(Shl4Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave4)
> -# endif
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl4LoopStart)
> -
> -L(Shl4LoopExit):
> - movdqu -4(%rcx), %xmm1
> - mov $12, %rsi
> - movdqu %xmm1, -4(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl5):
> - movaps -5(%rcx), %xmm1
> - movaps 11(%rcx), %xmm2
> -L(Shl5Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 27(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -11(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -5(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl5LoopStart):
> - movaps 11(%rcx), %xmm2
> - movaps 27(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 43(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 59(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $5, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $5, %xmm3, %xmm4
> - jnz L(Shl5Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave5)
> -# endif
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl5LoopStart)
> -
> -L(Shl5LoopExit):
> - movdqu -5(%rcx), %xmm1
> - mov $11, %rsi
> - movdqu %xmm1, -5(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl6):
> - movaps -6(%rcx), %xmm1
> - movaps 10(%rcx), %xmm2
> -L(Shl6Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 26(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -10(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -6(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl6LoopStart):
> - movaps 10(%rcx), %xmm2
> - movaps 26(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 42(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 58(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $6, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $6, %xmm3, %xmm4
> - jnz L(Shl6Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave6)
> -# endif
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl6LoopStart)
> -
> -L(Shl6LoopExit):
> - mov (%rcx), %r9
> - mov 6(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 6(%rdx)
> - mov $10, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl7):
> - movaps -7(%rcx), %xmm1
> - movaps 9(%rcx), %xmm2
> -L(Shl7Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 25(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -9(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -7(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl7LoopStart):
> - movaps 9(%rcx), %xmm2
> - movaps 25(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 41(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 57(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $7, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $7, %xmm3, %xmm4
> - jnz L(Shl7Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave7)
> -# endif
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl7LoopStart)
> -
> -L(Shl7LoopExit):
> - mov (%rcx), %r9
> - mov 5(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 5(%rdx)
> - mov $9, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl8):
> - movaps -8(%rcx), %xmm1
> - movaps 8(%rcx), %xmm2
> -L(Shl8Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 24(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -8(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -8(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl8LoopStart):
> - movaps 8(%rcx), %xmm2
> - movaps 24(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 40(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 56(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $8, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $8, %xmm3, %xmm4
> - jnz L(Shl8Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave8)
> -# endif
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl8LoopStart)
> -
> -L(Shl8LoopExit):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl9):
> - movaps -9(%rcx), %xmm1
> - movaps 7(%rcx), %xmm2
> -L(Shl9Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 23(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -7(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -9(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl9LoopStart):
> - movaps 7(%rcx), %xmm2
> - movaps 23(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 39(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 55(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $9, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $9, %xmm3, %xmm4
> - jnz L(Shl9Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave9)
> -# endif
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl9LoopStart)
> -
> -L(Shl9LoopExit):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl10):
> - movaps -10(%rcx), %xmm1
> - movaps 6(%rcx), %xmm2
> -L(Shl10Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 22(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -6(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -10(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl10LoopStart):
> - movaps 6(%rcx), %xmm2
> - movaps 22(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 38(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 54(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $10, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $10, %xmm3, %xmm4
> - jnz L(Shl10Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave10)
> -# endif
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl10LoopStart)
> -
> -L(Shl10LoopExit):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl11):
> - movaps -11(%rcx), %xmm1
> - movaps 5(%rcx), %xmm2
> -L(Shl11Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 21(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -5(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -11(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl11LoopStart):
> - movaps 5(%rcx), %xmm2
> - movaps 21(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 37(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 53(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $11, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $11, %xmm3, %xmm4
> - jnz L(Shl11Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave11)
> -# endif
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl11LoopStart)
> -
> -L(Shl11LoopExit):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl12):
> - movaps -12(%rcx), %xmm1
> - movaps 4(%rcx), %xmm2
> -L(Shl12Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 20(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -4(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -12(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl12LoopStart):
> - movaps 4(%rcx), %xmm2
> - movaps 20(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 36(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 52(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $12, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $12, %xmm3, %xmm4
> - jnz L(Shl12Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave12)
> -# endif
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl12LoopStart)
> -
> -L(Shl12LoopExit):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl13):
> - movaps -13(%rcx), %xmm1
> - movaps 3(%rcx), %xmm2
> -L(Shl13Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 19(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -3(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -13(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl13LoopStart):
> - movaps 3(%rcx), %xmm2
> - movaps 19(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 35(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 51(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $13, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $13, %xmm3, %xmm4
> - jnz L(Shl13Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave13)
> -# endif
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl13LoopStart)
> -
> -L(Shl13LoopExit):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl14):
> - movaps -14(%rcx), %xmm1
> - movaps 2(%rcx), %xmm2
> -L(Shl14Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 18(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -2(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -14(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl14LoopStart):
> - movaps 2(%rcx), %xmm2
> - movaps 18(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 34(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 50(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $14, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $14, %xmm3, %xmm4
> - jnz L(Shl14Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave14)
> -# endif
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl14LoopStart)
> -
> -L(Shl14LoopExit):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl15):
> - movaps -15(%rcx), %xmm1
> - movaps 1(%rcx), %xmm2
> -L(Shl15Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 17(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -1(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -15(%rcx), %xmm1
> -
> -/* 64-byte loop */
> - .p2align 4
> -L(Shl15LoopStart):
> - movaps 1(%rcx), %xmm2
> - movaps 17(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 33(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 49(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $15, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $15, %xmm3, %xmm4
> - jnz L(Shl15Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave15)
> -# endif
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl15LoopStart)
> -
> -L(Shl15LoopExit):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> -# ifdef USE_AS_STRCAT
> - jmp L(CopyFrom1To16Bytes)
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> -# ifdef USE_AS_STRNCPY
> - add $16, %r8
> -# endif
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> -
> - .p2align 4
> -L(Exit8):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $8, %r8
> - lea 8(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
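The cmpb $1, (%rax); sbb $-1, %rax pair in the USE_AS_STPCPY paths
above (repeated in every ExitN below) is a branchless fix-up of the
return value: cmpb sets the carry flag exactly when the byte at rax is
NUL, and sbb $-1 then computes rax + 1 - CF.  In C (sketch; the name is
invented):

/* LAST points at the last byte written; stpncpy must return a pointer
   to the copied NUL if there is one, else one past the last byte.  */
static char *
stpncpy_ret_fixup (char *last)
{
  return last + (*last != 0);
}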
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> -
> - .p2align 4
> -L(Exit16):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %rax
> - mov %rax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - lea 16(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - cmp $1, %r8
> - je L(Exit1)
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - test $0x40, %al
> - jnz L(Exit7)
> - jmp L(Exit8)
> -
> - .p2align 4
> -L(ExitHighCase2):
> - cmp $9, %r8
> - je L(Exit9)
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $15, %r8
> - je L(Exit15)
> - test $0x40, %ah
> - jnz L(Exit15)
> - jmp L(Exit16)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $16, %r8
> - je L(Exit16)
> - cmp $8, %r8
> - je L(Exit8)
> - jg L(More8Case3)
> - cmp $4, %r8
> - je L(Exit4)
> - jg L(More4Case3)
> - cmp $2, %r8
> - jl L(Exit1)
> - je L(Exit2)
> - jg L(Exit3)
> -L(More8Case3): /* but less than 16 */
> - cmp $12, %r8
> - je L(Exit12)
> - jl L(Less12Case3)
> - cmp $14, %r8
> - jl L(Exit13)
> - je L(Exit14)
> - jg L(Exit15)
> -L(More4Case3): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Exit5)
> - je L(Exit6)
> - jg L(Exit7)
> -L(Less12Case3): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Exit9)
> - je L(Exit10)
> - jg L(Exit11)
> -# endif
> -
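The Case2/Case3 split above covers the final 16-byte block under
USE_AS_STRNCPY: Case2 means both a NUL and the length bound land in
this block, so the copy stops at whichever comes first; Case3 means
only the bound does.  The byte count the two ladders resolve to is, in
effect (sketch; the name is invented):

#include <stddef.h>

/* MASK is the pmovmskb result for the block; N is 1..16 bytes left.  */
static size_t
final_block_len (unsigned mask, size_t n)
{
  if (mask == 0)				/* Case3: no NUL here.  */
    return n;
  size_t nul = (size_t) __builtin_ctz (mask) + 1;  /* Include the NUL.  */
  return nul < n ? nul : n;			/* Case2.  */
}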
> - .p2align 4
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea (%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $1, %r8
> - lea 1(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $2, %r8
> - lea 2(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $3, %r8
> - lea 3(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $4, %r8
> - lea 4(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit5):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $5, %r8
> - lea 5(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $6, %r8
> - lea 6(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movl 3(%rcx), %eax
> - movl %eax, 3(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $7, %r8
> - lea 7(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
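The ExitN stubs copy odd lengths with a pair of overlapping moves
instead of a byte loop; L(Exit7) above is two 4-byte moves sharing
byte 3.  The same trick in C (sketch; memcpy stands in for the untyped
register moves):

#include <string.h>

static inline void
copy7 (char *dst, const char *src)
{
  unsigned int head, tail;
  memcpy (&head, src, 4);	/* Bytes 0..3.  */
  memcpy (&tail, src + 3, 4);	/* Bytes 3..6, overlapping byte 3.  */
  memcpy (dst, &head, 4);
  memcpy (dst + 3, &tail, 4);
}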
> - .p2align 4
> -L(Exit9):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %eax
> - mov %eax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $9, %r8
> - lea 9(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %eax
> - mov %eax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $10, %r8
> - lea 10(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $11, %r8
> - lea 11(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $12, %r8
> - lea 12(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit13):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %rax
> - mov %rax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $13, %r8
> - lea 13(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %rax
> - mov %rax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $14, %r8
> - lea 14(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $15, %r8
> - lea 15(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(Fill0):
> - ret
> -
> - .p2align 4
> -L(Fill1):
> - movb %dl, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill2):
> - movw %dx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill3):
> - movw %dx, (%rcx)
> - movb %dl, 2(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill4):
> - movl %edx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill5):
> - movl %edx, (%rcx)
> - movb %dl, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill6):
> - movl %edx, (%rcx)
> - movw %dx, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill7):
> - movl %edx, (%rcx)
> - movl %edx, 3(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill8):
> - mov %rdx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill9):
> - mov %rdx, (%rcx)
> - movb %dl, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill10):
> - mov %rdx, (%rcx)
> - movw %dx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill11):
> - mov %rdx, (%rcx)
> - movl %edx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill12):
> - mov %rdx, (%rcx)
> - movl %edx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill13):
> - mov %rdx, (%rcx)
> - mov %rdx, 5(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill14):
> - mov %rdx, (%rcx)
> - mov %rdx, 6(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill15):
> - mov %rdx, (%rcx)
> - mov %rdx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill16):
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(StrncpyFillExit1):
> - lea 16(%r8), %r8
> -L(FillFrom1To16Bytes):
> - test %r8, %r8
> - jz L(Fill0)
> - cmp $16, %r8
> - je L(Fill16)
> - cmp $8, %r8
> - je L(Fill8)
> - jg L(FillMore8)
> - cmp $4, %r8
> - je L(Fill4)
> - jg L(FillMore4)
> - cmp $2, %r8
> - jl L(Fill1)
> - je L(Fill2)
> - jg L(Fill3)
> -L(FillMore8): /* but less than 16 */
> - cmp $12, %r8
> - je L(Fill12)
> - jl L(FillLess12)
> - cmp $14, %r8
> - jl L(Fill13)
> - je L(Fill14)
> - jg L(Fill15)
> -L(FillMore4): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Fill5)
> - je L(Fill6)
> - jg L(Fill7)
> -L(FillLess12): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Fill9)
> - je L(Fill10)
> - jmp L(Fill11)
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero1):
> - xor %rdx, %rdx
> - sub $16, %r8
> - jbe L(StrncpyFillExit1)
> -
> - pxor %xmm0, %xmm0
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> -
> - lea 16(%rcx), %rcx
> -
> - mov %rcx, %rdx
> - and $0xf, %rdx
> - sub %rdx, %rcx
> - add %rdx, %r8
> - xor %rdx, %rdx
> - sub $64, %r8
> - jb L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - movdqa %xmm0, 32(%rcx)
> - movdqa %xmm0, 48(%rcx)
> - lea 64(%rcx), %rcx
> - sub $64, %r8
> - jae L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> - add $32, %r8
> - jl L(StrncpyFillLess32)
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - lea 32(%rcx), %rcx
> - sub $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> -L(StrncpyFillLess32):
> - add $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
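The Fill* stubs and L(StrncpyFillTailWithZero1) above implement the
strncpy requirement that the destination be NUL-padded out to n bytes:
align the pointer, stream zeroing stores in 64-byte blocks, then finish
with the Fill1..Fill16 stubs.  Functionally it is just the following
(sketch; the asm uses aligned movdqa where this uses unaligned stores):

#include <emmintrin.h>
#include <stddef.h>

/* Zero the N bytes at P (P = first byte after the copied NUL).  */
static void
zero_tail (char *p, size_t n)
{
  __m128i z = _mm_setzero_si128 ();
  for (; n >= 16; p += 16, n -= 16)
    _mm_storeu_si128 ((__m128i *) p, z);
  while (n--)
    *p++ = '\0';
}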
> - .p2align 4
> -L(Exit0):
> - mov %rdx, %rax
> - ret
> -
> - .p2align 4
> -L(StrncpyExit15Bytes):
> - cmp $9, %r8
> - je L(Exit9)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit8Bytes):
> - cmp $1, %r8
> - je L(Exit1)
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
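L(StrncpyExit8Bytes) above (and the 15-byte variant before it)
interleaves the bound check with the NUL scan so no byte past either
limit is read.  The copy length it dispatches on is (sketch; the name
is invented):

#include <stddef.h>

/* SRC is the source, N the strncpy bound, here 1 <= N <= 8.  */
static size_t
short_copy_len (const char *src, size_t n)
{
  for (size_t i = 0; ; i++)
    if (i + 1 == n || src[i] == '\0')
      return i + 1;		/* Dispatches to L(Exit{i+1}).  */
}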
> -# endif
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(StrncpyLeaveCase2OrCase3):
> - test %rax, %rax
> - jnz L(Aligned64LeaveCase2)
> -
> -L(Aligned64LeaveCase3):
> - lea 64(%r8), %r8
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -L(Aligned64LeaveCase2):
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - add $48, %r8
> - jle L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase2)
> -/*--------------------------------------------------*/
> - .p2align 4
> -L(StrncpyExit1Case2OrCase3):
> - movdqu -1(%rcx), %xmm0
> - movdqu %xmm0, -1(%rdx)
> - mov $15, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit2Case2OrCase3):
> - movdqu -2(%rcx), %xmm0
> - movdqu %xmm0, -2(%rdx)
> - mov $14, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit3Case2OrCase3):
> - movdqu -3(%rcx), %xmm0
> - movdqu %xmm0, -3(%rdx)
> - mov $13, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit4Case2OrCase3):
> - movdqu -4(%rcx), %xmm0
> - movdqu %xmm0, -4(%rdx)
> - mov $12, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit5Case2OrCase3):
> - movdqu -5(%rcx), %xmm0
> - movdqu %xmm0, -5(%rdx)
> - mov $11, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit6Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 6(%rcx), %r9d
> - mov %r9d, 6(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $10, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit7Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 5(%rcx), %r9d
> - mov %r9d, 5(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $9, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit8Case2OrCase3):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit9Case2OrCase3):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit10Case2OrCase3):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit11Case2OrCase3):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit12Case2OrCase3):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit13Case2OrCase3):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit14Case2OrCase3):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit15Case2OrCase3):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave1):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit1)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit1):
> - lea 15(%rdx, %rsi), %rdx
> - lea 15(%rcx, %rsi), %rcx
> - mov -15(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -15(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
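The L(StrncpyExitN*) stubs above all finish the copy the same way: the
last 1..15 bytes are moved with one or two overlapping 8/4/2/1-byte
loads and stores sized to the remainder, instead of a byte loop.  In C
the 15-byte case is roughly (a sketch, not glibc code):

    #include <string.h>

    /* Two overlapping 8-byte moves cover a 15-byte tail; byte 7 is
       written twice, which is harmless for a forward copy.  */
    static void
    copy_tail_15 (char *dst, const char *src)
    {
      memcpy (dst, src, 8);             /* bytes 0..7  */
      memcpy (dst + 7, src + 7, 8);     /* bytes 7..14 */
    }

The same pattern shrinks to a single 8-, 4-, 2- or 1-byte move for the
shorter remainders.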
> - .p2align 4
> -L(StrncpyLeave2):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit2)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit2):
> - lea 14(%rdx, %rsi), %rdx
> - lea 14(%rcx, %rsi), %rcx
> - mov -14(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -14(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave3):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit3)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit3):
> - lea 13(%rdx, %rsi), %rdx
> - lea 13(%rcx, %rsi), %rcx
> - mov -13(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -13(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave4):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit4)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit4):
> - lea 12(%rdx, %rsi), %rdx
> - lea 12(%rcx, %rsi), %rcx
> - mov -12(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -12(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave5):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit5)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit5):
> - lea 11(%rdx, %rsi), %rdx
> - lea 11(%rcx, %rsi), %rcx
> - mov -11(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -11(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave6):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit6)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit6):
> - lea 10(%rdx, %rsi), %rdx
> - lea 10(%rcx, %rsi), %rcx
> - mov -10(%rcx), %rsi
> - movw -2(%rcx), %ax
> - mov %rsi, -10(%rdx)
> - movw %ax, -2(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave7):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit7)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit7):
> - lea 9(%rdx, %rsi), %rdx
> - lea 9(%rcx, %rsi), %rcx
> - mov -9(%rcx), %rsi
> - movb -1(%rcx), %ah
> - mov %rsi, -9(%rdx)
> - movb %ah, -1(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave8):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit8)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit8):
> - lea 8(%rdx, %rsi), %rdx
> - lea 8(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave9):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit9)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit9):
> - lea 7(%rdx, %rsi), %rdx
> - lea 7(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave10):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit10)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit10):
> - lea 6(%rdx, %rsi), %rdx
> - lea 6(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave11):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit11)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit11):
> - lea 5(%rdx, %rsi), %rdx
> - lea 5(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave12):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit12)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit12):
> - lea 4(%rdx, %rsi), %rdx
> - lea 4(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave13):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit13)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit13):
> - lea 3(%rdx, %rsi), %rdx
> - lea 3(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave14):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit14)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit14):
> - lea 2(%rdx, %rsi), %rdx
> - lea 2(%rcx, %rsi), %rcx
> - movw -2(%rcx), %ax
> - xor %rsi, %rsi
> - movw %ax, -2(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave15):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit15)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit15):
> - lea 1(%rdx, %rsi), %rdx
> - lea 1(%rcx, %rsi), %rcx
> - movb -1(%rcx), %ah
> - xor %rsi, %rsi
> - movb %ah, -1(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -# endif
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> deleted file mode 100644
> index bf82ee447d..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_ssse3
> -#include "strcpy-ssse3.S"
> --
> 2.25.1
>
* Re: [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back
2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:45 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
> sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +-
> sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
> sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
> 5 files changed, 6 insertions(+), 3212 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5b02ec8de5..303fb5d734 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -17,7 +17,6 @@ sysdep_routines += \
> memcmpeq-evex \
> memcmpeq-sse2 \
> memcpy-ssse3 \
> - memcpy-ssse3-back \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> @@ -25,7 +24,6 @@ sysdep_routines += \
> memmove-evex-unaligned-erms \
> memmove-sse2-unaligned-erms \
> memmove-ssse3 \
> - memmove-ssse3-back \
> memrchr-avx2 \
> memrchr-avx2-rtm \
> memrchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 49ce6860d0..c6008a73ed 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memmove_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memmove_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memmove_chk_ssse3)
> @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_avx512_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> - __memmove_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> __memmove_ssse3)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
> @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memcpy_chk_ssse3)
> @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> __memcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, memcpy,
> @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_chk_ssse3)
> @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, mempcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> index f8f958064c..fb01fbb301 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
> attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
> attribute_hidden;
> @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
> }
> }
>
> - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> - return OPTIMIZE (sse2_unaligned_erms);
> -
> - return OPTIMIZE (sse2_unaligned);
> + return OPTIMIZE (ssse3);
> }
>
> - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
> - return OPTIMIZE (ssse3_back);
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE (sse2_unaligned_erms);
>
> - return OPTIMIZE (ssse3);
> + return OPTIMIZE (sse2_unaligned);
> }
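
For reference, the tail of IFUNC_SELECTOR after this hunk reduces to
roughly the following (sketch only, with the CPU_FEATURE_USABLE_P /
CPU_FEATURES_ARCH_P checks collapsed to plain flags and the AVX/EVEX
paths above already ruled out):

    /* __memmove_ssse3_back is gone, so SSSE3 is chosen only when it
       is usable and unaligned copies are not already fast; everything
       else falls through to the SSE2 variants.  */
    static const char *
    memmove_tail_select (int ssse3, int fast_unaligned_copy, int erms)
    {
      if (ssse3 && !fast_unaligned_copy)
        return "ssse3";
      if (erms)
        return "sse2_unaligned_erms";
      return "sse2_unaligned";
    }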
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> deleted file mode 100644
> index 92cfbf7933..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> +++ /dev/null
> @@ -1,3181 +0,0 @@
> -/* memcpy with SSSE3 and REP string
> - Copyright (C) 2010-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -
> -#include "asm-syntax.h"
> -
> -#ifndef MEMCPY
> -# define MEMCPY __memcpy_ssse3_back
> -# define MEMCPY_CHK __memcpy_chk_ssse3_back
> -# define MEMPCPY __mempcpy_ssse3_back
> -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
> -#endif
> -
> -#define JMPTBL(I, B) I - B
> -
> -/* Branch to an entry in a jump table. TABLE is a jump table with
> - relative offsets. INDEX is a register containing the index into the
> - jump table. SCALE is the scale of INDEX. */
> -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), INDEX; \
> - lea (%r11, INDEX), INDEX; \
> - _CET_NOTRACK jmp *INDEX; \
> - ud2
> -
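The jump tables used with this macro store 32-bit offsets relative to
the table base (SCALE is 4 at every call site), which keeps the entries
position-independent.  In C the dispatch amounts to roughly this
sketch:

    #include <stddef.h>
    #include <stdint.h>

    typedef void (*handler_fn) (void);

    /* Load a self-relative 32-bit entry, add the table address back,
       and transfer control.  The assembly jumps (_CET_NOTRACK jmp)
       rather than calls.  */
    static void
    branch_to_jmptbl_entry (const int32_t *table, size_t index)
    {
      handler_fn fn = (handler_fn) ((const char *) table + table[index]);
      fn ();
    }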
> - .section .text.ssse3,"ax",@progbits
> -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
> -ENTRY (MEMPCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMPCPY_CHK)
> -
> -ENTRY (MEMPCPY)
> - mov %RDI_LP, %RAX_LP
> - add %RDX_LP, %RAX_LP
> - jmp L(start)
> -END (MEMPCPY)
> -#endif
> -
> -#if !defined USE_AS_BCOPY
> -ENTRY (MEMCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMCPY_CHK)
> -#endif
> -
> -ENTRY (MEMCPY)
> - mov %RDI_LP, %RAX_LP
> -#ifdef USE_AS_MEMPCPY
> - add %RDX_LP, %RAX_LP
> -#endif
> -
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -#endif
> -
> -#ifdef USE_AS_MEMMOVE
> - cmp %rsi, %rdi
> - jb L(copy_forward)
> - je L(bwd_write_0bytes)
> - cmp $144, %rdx
> - jae L(copy_backward)
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -L(copy_forward):
> -#endif
> -L(start):
> - cmp $144, %rdx
> - jae L(144bytesormore)
> -
> -L(fwd_write_less32bytes):
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jbe L(bk_write)
> -#endif
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -#ifndef USE_AS_MEMMOVE
> -L(bk_write):
> -
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -#endif
> -
> - .p2align 4
> -L(144bytesormore):
> -
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jle L(copy_backward)
> -#endif
> - movdqu (%rsi), %xmm0
> - mov %rdi, %r8
> - and $-16, %rdi
> - add $16, %rdi
> - mov %rdi, %r9
> - sub %r8, %r9
> - sub %r9, %rdx
> - add %r9, %rsi
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0)
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - cmp %rcx, %rdx
> - jae L(gobble_mem_fwd)
> - lea L(shl_table_fwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(copy_backward):
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - shl $1, %rcx
> - cmp %rcx, %rdx
> - ja L(gobble_mem_bwd)
> -
> - add %rdx, %rdi
> - add %rdx, %rsi
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $0xf, %r9
> - xor %r9, %rdi
> - sub %r9, %rsi
> - sub %r9, %rdx
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0_bwd)
> - lea L(shl_table_bwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(shl_0):
> -
> - mov %rdx, %r9
> - shr $8, %r9
> - add %rdx, %r9
> -#ifdef DATA_CACHE_SIZE
> - cmp $DATA_CACHE_SIZE_HALF, %R9_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %R9_LP
> -#endif
> - jae L(gobble_mem_fwd)
> - sub $0x80, %rdx
> - .p2align 4
> -L(shl_0_loop):
> - movdqa (%rsi), %xmm1
> - movdqa %xmm1, (%rdi)
> - movaps 0x10(%rsi), %xmm2
> - movaps %xmm2, 0x10(%rdi)
> - movaps 0x20(%rsi), %xmm3
> - movaps %xmm3, 0x20(%rdi)
> - movaps 0x30(%rsi), %xmm4
> - movaps %xmm4, 0x30(%rdi)
> - movaps 0x40(%rsi), %xmm1
> - movaps %xmm1, 0x40(%rdi)
> - movaps 0x50(%rsi), %xmm2
> - movaps %xmm2, 0x50(%rdi)
> - movaps 0x60(%rsi), %xmm3
> - movaps %xmm3, 0x60(%rdi)
> - movaps 0x70(%rsi), %xmm4
> - movaps %xmm4, 0x70(%rdi)
> - sub $0x80, %rdx
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(shl_0_loop)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_bwd):
> - sub $0x80, %rdx
> -L(copy_backward_loop):
> - movaps -0x10(%rsi), %xmm1
> - movaps %xmm1, -0x10(%rdi)
> - movaps -0x20(%rsi), %xmm2
> - movaps %xmm2, -0x20(%rdi)
> - movaps -0x30(%rsi), %xmm3
> - movaps %xmm3, -0x30(%rdi)
> - movaps -0x40(%rsi), %xmm4
> - movaps %xmm4, -0x40(%rdi)
> - movaps -0x50(%rsi), %xmm5
> - movaps %xmm5, -0x50(%rdi)
> - movaps -0x60(%rsi), %xmm5
> - movaps %xmm5, -0x60(%rdi)
> - movaps -0x70(%rsi), %xmm5
> - movaps %xmm5, -0x70(%rdi)
> - movaps -0x80(%rsi), %xmm5
> - movaps %xmm5, -0x80(%rdi)
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(copy_backward_loop)
> -
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1):
> - sub $0x80, %rdx
> - movaps -0x01(%rsi), %xmm1
> - movaps 0x0f(%rsi), %xmm2
> - movaps 0x1f(%rsi), %xmm3
> - movaps 0x2f(%rsi), %xmm4
> - movaps 0x3f(%rsi), %xmm5
> - movaps 0x4f(%rsi), %xmm6
> - movaps 0x5f(%rsi), %xmm7
> - movaps 0x6f(%rsi), %xmm8
> - movaps 0x7f(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $1, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $1, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $1, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $1, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $1, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $1, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $1, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_1)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1_bwd):
> - movaps -0x01(%rsi), %xmm1
> -
> - movaps -0x11(%rsi), %xmm2
> - palignr $1, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x21(%rsi), %xmm3
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x31(%rsi), %xmm4
> - palignr $1, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x41(%rsi), %xmm5
> - palignr $1, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x51(%rsi), %xmm6
> - palignr $1, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x61(%rsi), %xmm7
> - palignr $1, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x71(%rsi), %xmm8
> - palignr $1, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x81(%rsi), %xmm9
> - palignr $1, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_1_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
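Each of the sixteen L(shl_N)/L(shl_N_bwd) loops is the same trick
specialized for one source misalignment: do only 16-byte-aligned loads
from the rounded-down source and re-splice the bytes with PALIGNR,
whose shift count must be an immediate -- hence one loop per N.  With
intrinsics, one forward 16-byte step of the N = 1 case looks roughly
like this (sketch only; build with -mssse3):

    #include <emmintrin.h>
    #include <tmmintrin.h>

    /* src is 1 byte past 16-byte alignment, so src - 1 and src + 15
       are aligned; PALIGNR re-splices the 16 bytes starting at src.
       dst must be 16-byte aligned.  */
    static void
    copy16_src_shift1 (char *dst, const char *src)
    {
      __m128i lo = _mm_load_si128 ((const __m128i *) (src - 1));
      __m128i hi = _mm_load_si128 ((const __m128i *) (src + 15));
      _mm_store_si128 ((__m128i *) dst, _mm_alignr_epi8 (hi, lo, 1));
    }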
> - .p2align 4
> -L(shl_2):
> - sub $0x80, %rdx
> - movaps -0x02(%rsi), %xmm1
> - movaps 0x0e(%rsi), %xmm2
> - movaps 0x1e(%rsi), %xmm3
> - movaps 0x2e(%rsi), %xmm4
> - movaps 0x3e(%rsi), %xmm5
> - movaps 0x4e(%rsi), %xmm6
> - movaps 0x5e(%rsi), %xmm7
> - movaps 0x6e(%rsi), %xmm8
> - movaps 0x7e(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $2, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $2, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $2, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $2, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $2, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $2, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $2, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_2)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2_bwd):
> - movaps -0x02(%rsi), %xmm1
> -
> - movaps -0x12(%rsi), %xmm2
> - palignr $2, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x22(%rsi), %xmm3
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x32(%rsi), %xmm4
> - palignr $2, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x42(%rsi), %xmm5
> - palignr $2, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x52(%rsi), %xmm6
> - palignr $2, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x62(%rsi), %xmm7
> - palignr $2, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x72(%rsi), %xmm8
> - palignr $2, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x82(%rsi), %xmm9
> - palignr $2, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_2_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3):
> - sub $0x80, %rdx
> - movaps -0x03(%rsi), %xmm1
> - movaps 0x0d(%rsi), %xmm2
> - movaps 0x1d(%rsi), %xmm3
> - movaps 0x2d(%rsi), %xmm4
> - movaps 0x3d(%rsi), %xmm5
> - movaps 0x4d(%rsi), %xmm6
> - movaps 0x5d(%rsi), %xmm7
> - movaps 0x6d(%rsi), %xmm8
> - movaps 0x7d(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $3, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $3, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $3, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $3, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $3, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $3, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $3, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_3)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3_bwd):
> - movaps -0x03(%rsi), %xmm1
> -
> - movaps -0x13(%rsi), %xmm2
> - palignr $3, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x23(%rsi), %xmm3
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x33(%rsi), %xmm4
> - palignr $3, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x43(%rsi), %xmm5
> - palignr $3, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x53(%rsi), %xmm6
> - palignr $3, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x63(%rsi), %xmm7
> - palignr $3, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x73(%rsi), %xmm8
> - palignr $3, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x83(%rsi), %xmm9
> - palignr $3, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_3_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4):
> - sub $0x80, %rdx
> - movaps -0x04(%rsi), %xmm1
> - movaps 0x0c(%rsi), %xmm2
> - movaps 0x1c(%rsi), %xmm3
> - movaps 0x2c(%rsi), %xmm4
> - movaps 0x3c(%rsi), %xmm5
> - movaps 0x4c(%rsi), %xmm6
> - movaps 0x5c(%rsi), %xmm7
> - movaps 0x6c(%rsi), %xmm8
> - movaps 0x7c(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $4, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $4, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $4, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $4, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $4, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $4, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $4, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_4)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4_bwd):
> - movaps -0x04(%rsi), %xmm1
> -
> - movaps -0x14(%rsi), %xmm2
> - palignr $4, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x24(%rsi), %xmm3
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x34(%rsi), %xmm4
> - palignr $4, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x44(%rsi), %xmm5
> - palignr $4, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x54(%rsi), %xmm6
> - palignr $4, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x64(%rsi), %xmm7
> - palignr $4, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x74(%rsi), %xmm8
> - palignr $4, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x84(%rsi), %xmm9
> - palignr $4, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_4_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5):
> - sub $0x80, %rdx
> - movaps -0x05(%rsi), %xmm1
> - movaps 0x0b(%rsi), %xmm2
> - movaps 0x1b(%rsi), %xmm3
> - movaps 0x2b(%rsi), %xmm4
> - movaps 0x3b(%rsi), %xmm5
> - movaps 0x4b(%rsi), %xmm6
> - movaps 0x5b(%rsi), %xmm7
> - movaps 0x6b(%rsi), %xmm8
> - movaps 0x7b(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $5, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $5, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $5, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $5, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $5, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $5, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $5, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_5)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5_bwd):
> - movaps -0x05(%rsi), %xmm1
> -
> - movaps -0x15(%rsi), %xmm2
> - palignr $5, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x25(%rsi), %xmm3
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x35(%rsi), %xmm4
> - palignr $5, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x45(%rsi), %xmm5
> - palignr $5, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x55(%rsi), %xmm6
> - palignr $5, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x65(%rsi), %xmm7
> - palignr $5, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x75(%rsi), %xmm8
> - palignr $5, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x85(%rsi), %xmm9
> - palignr $5, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_5_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6):
> - sub $0x80, %rdx
> - movaps -0x06(%rsi), %xmm1
> - movaps 0x0a(%rsi), %xmm2
> - movaps 0x1a(%rsi), %xmm3
> - movaps 0x2a(%rsi), %xmm4
> - movaps 0x3a(%rsi), %xmm5
> - movaps 0x4a(%rsi), %xmm6
> - movaps 0x5a(%rsi), %xmm7
> - movaps 0x6a(%rsi), %xmm8
> - movaps 0x7a(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $6, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $6, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $6, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $6, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $6, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $6, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $6, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_6)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6_bwd):
> - movaps -0x06(%rsi), %xmm1
> -
> - movaps -0x16(%rsi), %xmm2
> - palignr $6, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x26(%rsi), %xmm3
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x36(%rsi), %xmm4
> - palignr $6, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x46(%rsi), %xmm5
> - palignr $6, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x56(%rsi), %xmm6
> - palignr $6, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x66(%rsi), %xmm7
> - palignr $6, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x76(%rsi), %xmm8
> - palignr $6, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x86(%rsi), %xmm9
> - palignr $6, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_6_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7):
> - sub $0x80, %rdx
> - movaps -0x07(%rsi), %xmm1
> - movaps 0x09(%rsi), %xmm2
> - movaps 0x19(%rsi), %xmm3
> - movaps 0x29(%rsi), %xmm4
> - movaps 0x39(%rsi), %xmm5
> - movaps 0x49(%rsi), %xmm6
> - movaps 0x59(%rsi), %xmm7
> - movaps 0x69(%rsi), %xmm8
> - movaps 0x79(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $7, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $7, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $7, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $7, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $7, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $7, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $7, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_7)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7_bwd):
> - movaps -0x07(%rsi), %xmm1
> -
> - movaps -0x17(%rsi), %xmm2
> - palignr $7, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x27(%rsi), %xmm3
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x37(%rsi), %xmm4
> - palignr $7, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x47(%rsi), %xmm5
> - palignr $7, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x57(%rsi), %xmm6
> - palignr $7, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x67(%rsi), %xmm7
> - palignr $7, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x77(%rsi), %xmm8
> - palignr $7, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x87(%rsi), %xmm9
> - palignr $7, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_7_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8):
> - sub $0x80, %rdx
> - movaps -0x08(%rsi), %xmm1
> - movaps 0x08(%rsi), %xmm2
> - movaps 0x18(%rsi), %xmm3
> - movaps 0x28(%rsi), %xmm4
> - movaps 0x38(%rsi), %xmm5
> - movaps 0x48(%rsi), %xmm6
> - movaps 0x58(%rsi), %xmm7
> - movaps 0x68(%rsi), %xmm8
> - movaps 0x78(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $8, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $8, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $8, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $8, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $8, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $8, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $8, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_8)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8_bwd):
> - movaps -0x08(%rsi), %xmm1
> -
> - movaps -0x18(%rsi), %xmm2
> - palignr $8, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x28(%rsi), %xmm3
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x38(%rsi), %xmm4
> - palignr $8, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x48(%rsi), %xmm5
> - palignr $8, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x58(%rsi), %xmm6
> - palignr $8, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x68(%rsi), %xmm7
> - palignr $8, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x78(%rsi), %xmm8
> - palignr $8, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x88(%rsi), %xmm9
> - palignr $8, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_8_bwd)
> -L(shl_8_end_bwd):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9):
> - sub $0x80, %rdx
> - movaps -0x09(%rsi), %xmm1
> - movaps 0x07(%rsi), %xmm2
> - movaps 0x17(%rsi), %xmm3
> - movaps 0x27(%rsi), %xmm4
> - movaps 0x37(%rsi), %xmm5
> - movaps 0x47(%rsi), %xmm6
> - movaps 0x57(%rsi), %xmm7
> - movaps 0x67(%rsi), %xmm8
> - movaps 0x77(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $9, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $9, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $9, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $9, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $9, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $9, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $9, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_9)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9_bwd):
> - movaps -0x09(%rsi), %xmm1
> -
> - movaps -0x19(%rsi), %xmm2
> - palignr $9, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x29(%rsi), %xmm3
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x39(%rsi), %xmm4
> - palignr $9, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x49(%rsi), %xmm5
> - palignr $9, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x59(%rsi), %xmm6
> - palignr $9, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x69(%rsi), %xmm7
> - palignr $9, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x79(%rsi), %xmm8
> - palignr $9, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x89(%rsi), %xmm9
> - palignr $9, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_9_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10):
> - sub $0x80, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - movaps 0x06(%rsi), %xmm2
> - movaps 0x16(%rsi), %xmm3
> - movaps 0x26(%rsi), %xmm4
> - movaps 0x36(%rsi), %xmm5
> - movaps 0x46(%rsi), %xmm6
> - movaps 0x56(%rsi), %xmm7
> - movaps 0x66(%rsi), %xmm8
> - movaps 0x76(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $10, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $10, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $10, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $10, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $10, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $10, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $10, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_10)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10_bwd):
> - movaps -0x0a(%rsi), %xmm1
> -
> - movaps -0x1a(%rsi), %xmm2
> - palignr $10, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2a(%rsi), %xmm3
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3a(%rsi), %xmm4
> - palignr $10, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4a(%rsi), %xmm5
> - palignr $10, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5a(%rsi), %xmm6
> - palignr $10, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6a(%rsi), %xmm7
> - palignr $10, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7a(%rsi), %xmm8
> - palignr $10, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8a(%rsi), %xmm9
> - palignr $10, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_10_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11):
> - sub $0x80, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - movaps 0x05(%rsi), %xmm2
> - movaps 0x15(%rsi), %xmm3
> - movaps 0x25(%rsi), %xmm4
> - movaps 0x35(%rsi), %xmm5
> - movaps 0x45(%rsi), %xmm6
> - movaps 0x55(%rsi), %xmm7
> - movaps 0x65(%rsi), %xmm8
> - movaps 0x75(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $11, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $11, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $11, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $11, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $11, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $11, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $11, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_11)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11_bwd):
> - movaps -0x0b(%rsi), %xmm1
> -
> - movaps -0x1b(%rsi), %xmm2
> - palignr $11, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2b(%rsi), %xmm3
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3b(%rsi), %xmm4
> - palignr $11, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4b(%rsi), %xmm5
> - palignr $11, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5b(%rsi), %xmm6
> - palignr $11, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6b(%rsi), %xmm7
> - palignr $11, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7b(%rsi), %xmm8
> - palignr $11, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8b(%rsi), %xmm9
> - palignr $11, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_11_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12):
> - sub $0x80, %rdx
> - movdqa -0x0c(%rsi), %xmm1
> - movaps 0x04(%rsi), %xmm2
> - movaps 0x14(%rsi), %xmm3
> - movaps 0x24(%rsi), %xmm4
> - movaps 0x34(%rsi), %xmm5
> - movaps 0x44(%rsi), %xmm6
> - movaps 0x54(%rsi), %xmm7
> - movaps 0x64(%rsi), %xmm8
> - movaps 0x74(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $12, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $12, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $12, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $12, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $12, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $12, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $12, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> -
> - lea 0x80(%rdi), %rdi
> - jae L(shl_12)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12_bwd):
> - movaps -0x0c(%rsi), %xmm1
> -
> - movaps -0x1c(%rsi), %xmm2
> - palignr $12, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2c(%rsi), %xmm3
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3c(%rsi), %xmm4
> - palignr $12, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4c(%rsi), %xmm5
> - palignr $12, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5c(%rsi), %xmm6
> - palignr $12, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6c(%rsi), %xmm7
> - palignr $12, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7c(%rsi), %xmm8
> - palignr $12, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8c(%rsi), %xmm9
> - palignr $12, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_12_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13):
> - sub $0x80, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - movaps 0x03(%rsi), %xmm2
> - movaps 0x13(%rsi), %xmm3
> - movaps 0x23(%rsi), %xmm4
> - movaps 0x33(%rsi), %xmm5
> - movaps 0x43(%rsi), %xmm6
> - movaps 0x53(%rsi), %xmm7
> - movaps 0x63(%rsi), %xmm8
> - movaps 0x73(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $13, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $13, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $13, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $13, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $13, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $13, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $13, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_13)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13_bwd):
> - movaps -0x0d(%rsi), %xmm1
> -
> - movaps -0x1d(%rsi), %xmm2
> - palignr $13, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2d(%rsi), %xmm3
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3d(%rsi), %xmm4
> - palignr $13, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4d(%rsi), %xmm5
> - palignr $13, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5d(%rsi), %xmm6
> - palignr $13, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6d(%rsi), %xmm7
> - palignr $13, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7d(%rsi), %xmm8
> - palignr $13, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8d(%rsi), %xmm9
> - palignr $13, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_13_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14):
> - sub $0x80, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - movaps 0x02(%rsi), %xmm2
> - movaps 0x12(%rsi), %xmm3
> - movaps 0x22(%rsi), %xmm4
> - movaps 0x32(%rsi), %xmm5
> - movaps 0x42(%rsi), %xmm6
> - movaps 0x52(%rsi), %xmm7
> - movaps 0x62(%rsi), %xmm8
> - movaps 0x72(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $14, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $14, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $14, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $14, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $14, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $14, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $14, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_14)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14_bwd):
> - movaps -0x0e(%rsi), %xmm1
> -
> - movaps -0x1e(%rsi), %xmm2
> - palignr $14, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2e(%rsi), %xmm3
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3e(%rsi), %xmm4
> - palignr $14, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4e(%rsi), %xmm5
> - palignr $14, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5e(%rsi), %xmm6
> - palignr $14, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6e(%rsi), %xmm7
> - palignr $14, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7e(%rsi), %xmm8
> - palignr $14, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8e(%rsi), %xmm9
> - palignr $14, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_14_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15):
> - sub $0x80, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - movaps 0x01(%rsi), %xmm2
> - movaps 0x11(%rsi), %xmm3
> - movaps 0x21(%rsi), %xmm4
> - movaps 0x31(%rsi), %xmm5
> - movaps 0x41(%rsi), %xmm6
> - movaps 0x51(%rsi), %xmm7
> - movaps 0x61(%rsi), %xmm8
> - movaps 0x71(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $15, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $15, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $15, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $15, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $15, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $15, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $15, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_15)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15_bwd):
> - movaps -0x0f(%rsi), %xmm1
> -
> - movaps -0x1f(%rsi), %xmm2
> - palignr $15, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2f(%rsi), %xmm3
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3f(%rsi), %xmm4
> - palignr $15, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4f(%rsi), %xmm5
> - palignr $15, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5f(%rsi), %xmm6
> - palignr $15, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6f(%rsi), %xmm7
> - palignr $15, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7f(%rsi), %xmm8
> - palignr $15, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8f(%rsi), %xmm9
> - palignr $15, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_15_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_fwd):
> - movdqu (%rsi), %xmm1
> - movdqu %xmm0, (%r8)
> - movdqa %xmm1, (%rdi)
> - sub $16, %rdx
> - add $16, %rsi
> - add $16, %rdi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rsi, %r9
> - sub %rdi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_fwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_fwd_start)
> -L(memmove_is_memcpy_fwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger_in_fwd)
> - mov %rdx, %rcx
> -L(bigger_in_fwd):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy_fwd)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy_fwd)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy_fwd):
> - sub $0x80, %rdx
> -L(gobble_mem_fwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 0x200(%rsi)
> - prefetcht0 0x300(%rsi)
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lfence
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - movntdq %xmm4, 0x40(%rdi)
> - movntdq %xmm5, 0x50(%rdi)
> - movntdq %xmm6, 0x60(%rdi)
> - movntdq %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_mem_fwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_fwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy_fwd):
> - add %rcx, %rdx
> -L(ll_cache_copy_fwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop_fwd):
> - prefetchnta 0x1c0(%rsi)
> - prefetchnta 0x280(%rsi)
> - prefetchnta 0x1c0(%rdi)
> - prefetchnta 0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - movdqa %xmm2, 0x20(%rdi)
> - movdqa %xmm3, 0x30(%rdi)
> - movdqa %xmm4, 0x40(%rdi)
> - movdqa %xmm5, 0x50(%rdi)
> - movdqa %xmm6, 0x60(%rdi)
> - movdqa %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_ll_loop_fwd)
> -L(gobble_mem_fwd_end):
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_bwd):
> - add %rdx, %rsi
> - add %rdx, %rdi
> -
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $-16, %rdi
> - sub %rdi, %r9
> - sub %r9, %rsi
> - sub %r9, %rdx
> -
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rdi, %r9
> - sub %rsi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_bwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_bwd_start)
> -L(memmove_is_memcpy_bwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger)
> - mov %rdx, %rcx
> -L(bigger):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy):
> - sub $0x80, %rdx
> -L(gobble_mem_bwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 -0x200(%rsi)
> - prefetcht0 -0x300(%rsi)
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - lfence
> - movntdq %xmm1, -0x10(%rdi)
> - movntdq %xmm2, -0x20(%rdi)
> - movntdq %xmm3, -0x30(%rdi)
> - movntdq %xmm4, -0x40(%rdi)
> - movntdq %xmm5, -0x50(%rdi)
> - movntdq %xmm6, -0x60(%rdi)
> - movntdq %xmm7, -0x70(%rdi)
> - movntdq %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_mem_bwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_bwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy):
> - add %rcx, %rdx
> -L(ll_cache_copy_bwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop):
> - prefetchnta -0x1c0(%rsi)
> - prefetchnta -0x280(%rsi)
> - prefetchnta -0x1c0(%rdi)
> - prefetchnta -0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - movdqa %xmm1, -0x10(%rdi)
> - movdqa %xmm2, -0x20(%rdi)
> - movdqa %xmm3, -0x30(%rdi)
> - movdqa %xmm4, -0x40(%rdi)
> - movdqa %xmm5, -0x50(%rdi)
> - movdqa %xmm6, -0x60(%rdi)
> - movdqa %xmm7, -0x70(%rdi)
> - movdqa %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_ll_loop)
> -L(gobble_mem_bwd_end):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rsi
> - sub %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(fwd_write_128bytes):
> - lddqu -128(%rsi), %xmm0
> - movdqu %xmm0, -128(%rdi)
> -L(fwd_write_112bytes):
> - lddqu -112(%rsi), %xmm0
> - movdqu %xmm0, -112(%rdi)
> -L(fwd_write_96bytes):
> - lddqu -96(%rsi), %xmm0
> - movdqu %xmm0, -96(%rdi)
> -L(fwd_write_80bytes):
> - lddqu -80(%rsi), %xmm0
> - movdqu %xmm0, -80(%rdi)
> -L(fwd_write_64bytes):
> - lddqu -64(%rsi), %xmm0
> - movdqu %xmm0, -64(%rdi)
> -L(fwd_write_48bytes):
> - lddqu -48(%rsi), %xmm0
> - movdqu %xmm0, -48(%rdi)
> -L(fwd_write_32bytes):
> - lddqu -32(%rsi), %xmm0
> - movdqu %xmm0, -32(%rdi)
> -L(fwd_write_16bytes):
> - lddqu -16(%rsi), %xmm0
> - movdqu %xmm0, -16(%rdi)
> -L(fwd_write_0bytes):
> - ret
> -
> -
> - .p2align 4
> -L(fwd_write_143bytes):
> - lddqu -143(%rsi), %xmm0
> - movdqu %xmm0, -143(%rdi)
> -L(fwd_write_127bytes):
> - lddqu -127(%rsi), %xmm0
> - movdqu %xmm0, -127(%rdi)
> -L(fwd_write_111bytes):
> - lddqu -111(%rsi), %xmm0
> - movdqu %xmm0, -111(%rdi)
> -L(fwd_write_95bytes):
> - lddqu -95(%rsi), %xmm0
> - movdqu %xmm0, -95(%rdi)
> -L(fwd_write_79bytes):
> - lddqu -79(%rsi), %xmm0
> - movdqu %xmm0, -79(%rdi)
> -L(fwd_write_63bytes):
> - lddqu -63(%rsi), %xmm0
> - movdqu %xmm0, -63(%rdi)
> -L(fwd_write_47bytes):
> - lddqu -47(%rsi), %xmm0
> - movdqu %xmm0, -47(%rdi)
> -L(fwd_write_31bytes):
> - lddqu -31(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -31(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_15bytes):
> - mov -15(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -15(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_142bytes):
> - lddqu -142(%rsi), %xmm0
> - movdqu %xmm0, -142(%rdi)
> -L(fwd_write_126bytes):
> - lddqu -126(%rsi), %xmm0
> - movdqu %xmm0, -126(%rdi)
> -L(fwd_write_110bytes):
> - lddqu -110(%rsi), %xmm0
> - movdqu %xmm0, -110(%rdi)
> -L(fwd_write_94bytes):
> - lddqu -94(%rsi), %xmm0
> - movdqu %xmm0, -94(%rdi)
> -L(fwd_write_78bytes):
> - lddqu -78(%rsi), %xmm0
> - movdqu %xmm0, -78(%rdi)
> -L(fwd_write_62bytes):
> - lddqu -62(%rsi), %xmm0
> - movdqu %xmm0, -62(%rdi)
> -L(fwd_write_46bytes):
> - lddqu -46(%rsi), %xmm0
> - movdqu %xmm0, -46(%rdi)
> -L(fwd_write_30bytes):
> - lddqu -30(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -30(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_14bytes):
> - mov -14(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -14(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_141bytes):
> - lddqu -141(%rsi), %xmm0
> - movdqu %xmm0, -141(%rdi)
> -L(fwd_write_125bytes):
> - lddqu -125(%rsi), %xmm0
> - movdqu %xmm0, -125(%rdi)
> -L(fwd_write_109bytes):
> - lddqu -109(%rsi), %xmm0
> - movdqu %xmm0, -109(%rdi)
> -L(fwd_write_93bytes):
> - lddqu -93(%rsi), %xmm0
> - movdqu %xmm0, -93(%rdi)
> -L(fwd_write_77bytes):
> - lddqu -77(%rsi), %xmm0
> - movdqu %xmm0, -77(%rdi)
> -L(fwd_write_61bytes):
> - lddqu -61(%rsi), %xmm0
> - movdqu %xmm0, -61(%rdi)
> -L(fwd_write_45bytes):
> - lddqu -45(%rsi), %xmm0
> - movdqu %xmm0, -45(%rdi)
> -L(fwd_write_29bytes):
> - lddqu -29(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -29(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_13bytes):
> - mov -13(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -13(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_140bytes):
> - lddqu -140(%rsi), %xmm0
> - movdqu %xmm0, -140(%rdi)
> -L(fwd_write_124bytes):
> - lddqu -124(%rsi), %xmm0
> - movdqu %xmm0, -124(%rdi)
> -L(fwd_write_108bytes):
> - lddqu -108(%rsi), %xmm0
> - movdqu %xmm0, -108(%rdi)
> -L(fwd_write_92bytes):
> - lddqu -92(%rsi), %xmm0
> - movdqu %xmm0, -92(%rdi)
> -L(fwd_write_76bytes):
> - lddqu -76(%rsi), %xmm0
> - movdqu %xmm0, -76(%rdi)
> -L(fwd_write_60bytes):
> - lddqu -60(%rsi), %xmm0
> - movdqu %xmm0, -60(%rdi)
> -L(fwd_write_44bytes):
> - lddqu -44(%rsi), %xmm0
> - movdqu %xmm0, -44(%rdi)
> -L(fwd_write_28bytes):
> - lddqu -28(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -28(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_12bytes):
> - mov -12(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -12(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_139bytes):
> - lddqu -139(%rsi), %xmm0
> - movdqu %xmm0, -139(%rdi)
> -L(fwd_write_123bytes):
> - lddqu -123(%rsi), %xmm0
> - movdqu %xmm0, -123(%rdi)
> -L(fwd_write_107bytes):
> - lddqu -107(%rsi), %xmm0
> - movdqu %xmm0, -107(%rdi)
> -L(fwd_write_91bytes):
> - lddqu -91(%rsi), %xmm0
> - movdqu %xmm0, -91(%rdi)
> -L(fwd_write_75bytes):
> - lddqu -75(%rsi), %xmm0
> - movdqu %xmm0, -75(%rdi)
> -L(fwd_write_59bytes):
> - lddqu -59(%rsi), %xmm0
> - movdqu %xmm0, -59(%rdi)
> -L(fwd_write_43bytes):
> - lddqu -43(%rsi), %xmm0
> - movdqu %xmm0, -43(%rdi)
> -L(fwd_write_27bytes):
> - lddqu -27(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -27(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_11bytes):
> - mov -11(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -11(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_138bytes):
> - lddqu -138(%rsi), %xmm0
> - movdqu %xmm0, -138(%rdi)
> -L(fwd_write_122bytes):
> - lddqu -122(%rsi), %xmm0
> - movdqu %xmm0, -122(%rdi)
> -L(fwd_write_106bytes):
> - lddqu -106(%rsi), %xmm0
> - movdqu %xmm0, -106(%rdi)
> -L(fwd_write_90bytes):
> - lddqu -90(%rsi), %xmm0
> - movdqu %xmm0, -90(%rdi)
> -L(fwd_write_74bytes):
> - lddqu -74(%rsi), %xmm0
> - movdqu %xmm0, -74(%rdi)
> -L(fwd_write_58bytes):
> - lddqu -58(%rsi), %xmm0
> - movdqu %xmm0, -58(%rdi)
> -L(fwd_write_42bytes):
> - lddqu -42(%rsi), %xmm0
> - movdqu %xmm0, -42(%rdi)
> -L(fwd_write_26bytes):
> - lddqu -26(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -26(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_10bytes):
> - mov -10(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -10(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_137bytes):
> - lddqu -137(%rsi), %xmm0
> - movdqu %xmm0, -137(%rdi)
> -L(fwd_write_121bytes):
> - lddqu -121(%rsi), %xmm0
> - movdqu %xmm0, -121(%rdi)
> -L(fwd_write_105bytes):
> - lddqu -105(%rsi), %xmm0
> - movdqu %xmm0, -105(%rdi)
> -L(fwd_write_89bytes):
> - lddqu -89(%rsi), %xmm0
> - movdqu %xmm0, -89(%rdi)
> -L(fwd_write_73bytes):
> - lddqu -73(%rsi), %xmm0
> - movdqu %xmm0, -73(%rdi)
> -L(fwd_write_57bytes):
> - lddqu -57(%rsi), %xmm0
> - movdqu %xmm0, -57(%rdi)
> -L(fwd_write_41bytes):
> - lddqu -41(%rsi), %xmm0
> - movdqu %xmm0, -41(%rdi)
> -L(fwd_write_25bytes):
> - lddqu -25(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -25(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_9bytes):
> - mov -9(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -9(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_136bytes):
> - lddqu -136(%rsi), %xmm0
> - movdqu %xmm0, -136(%rdi)
> -L(fwd_write_120bytes):
> - lddqu -120(%rsi), %xmm0
> - movdqu %xmm0, -120(%rdi)
> -L(fwd_write_104bytes):
> - lddqu -104(%rsi), %xmm0
> - movdqu %xmm0, -104(%rdi)
> -L(fwd_write_88bytes):
> - lddqu -88(%rsi), %xmm0
> - movdqu %xmm0, -88(%rdi)
> -L(fwd_write_72bytes):
> - lddqu -72(%rsi), %xmm0
> - movdqu %xmm0, -72(%rdi)
> -L(fwd_write_56bytes):
> - lddqu -56(%rsi), %xmm0
> - movdqu %xmm0, -56(%rdi)
> -L(fwd_write_40bytes):
> - lddqu -40(%rsi), %xmm0
> - movdqu %xmm0, -40(%rdi)
> -L(fwd_write_24bytes):
> - lddqu -24(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -24(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_8bytes):
> - mov -8(%rsi), %rdx
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_135bytes):
> - lddqu -135(%rsi), %xmm0
> - movdqu %xmm0, -135(%rdi)
> -L(fwd_write_119bytes):
> - lddqu -119(%rsi), %xmm0
> - movdqu %xmm0, -119(%rdi)
> -L(fwd_write_103bytes):
> - lddqu -103(%rsi), %xmm0
> - movdqu %xmm0, -103(%rdi)
> -L(fwd_write_87bytes):
> - lddqu -87(%rsi), %xmm0
> - movdqu %xmm0, -87(%rdi)
> -L(fwd_write_71bytes):
> - lddqu -71(%rsi), %xmm0
> - movdqu %xmm0, -71(%rdi)
> -L(fwd_write_55bytes):
> - lddqu -55(%rsi), %xmm0
> - movdqu %xmm0, -55(%rdi)
> -L(fwd_write_39bytes):
> - lddqu -39(%rsi), %xmm0
> - movdqu %xmm0, -39(%rdi)
> -L(fwd_write_23bytes):
> - lddqu -23(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -23(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_7bytes):
> - mov -7(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -7(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_134bytes):
> - lddqu -134(%rsi), %xmm0
> - movdqu %xmm0, -134(%rdi)
> -L(fwd_write_118bytes):
> - lddqu -118(%rsi), %xmm0
> - movdqu %xmm0, -118(%rdi)
> -L(fwd_write_102bytes):
> - lddqu -102(%rsi), %xmm0
> - movdqu %xmm0, -102(%rdi)
> -L(fwd_write_86bytes):
> - lddqu -86(%rsi), %xmm0
> - movdqu %xmm0, -86(%rdi)
> -L(fwd_write_70bytes):
> - lddqu -70(%rsi), %xmm0
> - movdqu %xmm0, -70(%rdi)
> -L(fwd_write_54bytes):
> - lddqu -54(%rsi), %xmm0
> - movdqu %xmm0, -54(%rdi)
> -L(fwd_write_38bytes):
> - lddqu -38(%rsi), %xmm0
> - movdqu %xmm0, -38(%rdi)
> -L(fwd_write_22bytes):
> - lddqu -22(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -22(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_6bytes):
> - mov -6(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -6(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_133bytes):
> - lddqu -133(%rsi), %xmm0
> - movdqu %xmm0, -133(%rdi)
> -L(fwd_write_117bytes):
> - lddqu -117(%rsi), %xmm0
> - movdqu %xmm0, -117(%rdi)
> -L(fwd_write_101bytes):
> - lddqu -101(%rsi), %xmm0
> - movdqu %xmm0, -101(%rdi)
> -L(fwd_write_85bytes):
> - lddqu -85(%rsi), %xmm0
> - movdqu %xmm0, -85(%rdi)
> -L(fwd_write_69bytes):
> - lddqu -69(%rsi), %xmm0
> - movdqu %xmm0, -69(%rdi)
> -L(fwd_write_53bytes):
> - lddqu -53(%rsi), %xmm0
> - movdqu %xmm0, -53(%rdi)
> -L(fwd_write_37bytes):
> - lddqu -37(%rsi), %xmm0
> - movdqu %xmm0, -37(%rdi)
> -L(fwd_write_21bytes):
> - lddqu -21(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -21(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_5bytes):
> - mov -5(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -5(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_132bytes):
> - lddqu -132(%rsi), %xmm0
> - movdqu %xmm0, -132(%rdi)
> -L(fwd_write_116bytes):
> - lddqu -116(%rsi), %xmm0
> - movdqu %xmm0, -116(%rdi)
> -L(fwd_write_100bytes):
> - lddqu -100(%rsi), %xmm0
> - movdqu %xmm0, -100(%rdi)
> -L(fwd_write_84bytes):
> - lddqu -84(%rsi), %xmm0
> - movdqu %xmm0, -84(%rdi)
> -L(fwd_write_68bytes):
> - lddqu -68(%rsi), %xmm0
> - movdqu %xmm0, -68(%rdi)
> -L(fwd_write_52bytes):
> - lddqu -52(%rsi), %xmm0
> - movdqu %xmm0, -52(%rdi)
> -L(fwd_write_36bytes):
> - lddqu -36(%rsi), %xmm0
> - movdqu %xmm0, -36(%rdi)
> -L(fwd_write_20bytes):
> - lddqu -20(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -20(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_4bytes):
> - mov -4(%rsi), %edx
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_131bytes):
> - lddqu -131(%rsi), %xmm0
> - movdqu %xmm0, -131(%rdi)
> -L(fwd_write_115bytes):
> - lddqu -115(%rsi), %xmm0
> - movdqu %xmm0, -115(%rdi)
> -L(fwd_write_99bytes):
> - lddqu -99(%rsi), %xmm0
> - movdqu %xmm0, -99(%rdi)
> -L(fwd_write_83bytes):
> - lddqu -83(%rsi), %xmm0
> - movdqu %xmm0, -83(%rdi)
> -L(fwd_write_67bytes):
> - lddqu -67(%rsi), %xmm0
> - movdqu %xmm0, -67(%rdi)
> -L(fwd_write_51bytes):
> - lddqu -51(%rsi), %xmm0
> - movdqu %xmm0, -51(%rdi)
> -L(fwd_write_35bytes):
> - lddqu -35(%rsi), %xmm0
> - movdqu %xmm0, -35(%rdi)
> -L(fwd_write_19bytes):
> - lddqu -19(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -19(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_3bytes):
> - mov -3(%rsi), %dx
> - mov -2(%rsi), %cx
> - mov %dx, -3(%rdi)
> - mov %cx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_130bytes):
> - lddqu -130(%rsi), %xmm0
> - movdqu %xmm0, -130(%rdi)
> -L(fwd_write_114bytes):
> - lddqu -114(%rsi), %xmm0
> - movdqu %xmm0, -114(%rdi)
> -L(fwd_write_98bytes):
> - lddqu -98(%rsi), %xmm0
> - movdqu %xmm0, -98(%rdi)
> -L(fwd_write_82bytes):
> - lddqu -82(%rsi), %xmm0
> - movdqu %xmm0, -82(%rdi)
> -L(fwd_write_66bytes):
> - lddqu -66(%rsi), %xmm0
> - movdqu %xmm0, -66(%rdi)
> -L(fwd_write_50bytes):
> - lddqu -50(%rsi), %xmm0
> - movdqu %xmm0, -50(%rdi)
> -L(fwd_write_34bytes):
> - lddqu -34(%rsi), %xmm0
> - movdqu %xmm0, -34(%rdi)
> -L(fwd_write_18bytes):
> - lddqu -18(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -18(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_2bytes):
> - movzwl -2(%rsi), %edx
> - mov %dx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_129bytes):
> - lddqu -129(%rsi), %xmm0
> - movdqu %xmm0, -129(%rdi)
> -L(fwd_write_113bytes):
> - lddqu -113(%rsi), %xmm0
> - movdqu %xmm0, -113(%rdi)
> -L(fwd_write_97bytes):
> - lddqu -97(%rsi), %xmm0
> - movdqu %xmm0, -97(%rdi)
> -L(fwd_write_81bytes):
> - lddqu -81(%rsi), %xmm0
> - movdqu %xmm0, -81(%rdi)
> -L(fwd_write_65bytes):
> - lddqu -65(%rsi), %xmm0
> - movdqu %xmm0, -65(%rdi)
> -L(fwd_write_49bytes):
> - lddqu -49(%rsi), %xmm0
> - movdqu %xmm0, -49(%rdi)
> -L(fwd_write_33bytes):
> - lddqu -33(%rsi), %xmm0
> - movdqu %xmm0, -33(%rdi)
> -L(fwd_write_17bytes):
> - lddqu -17(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -17(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_1bytes):
> - movzbl -1(%rsi), %edx
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_128bytes):
> - lddqu 112(%rsi), %xmm0
> - movdqu %xmm0, 112(%rdi)
> -L(bwd_write_112bytes):
> - lddqu 96(%rsi), %xmm0
> - movdqu %xmm0, 96(%rdi)
> -L(bwd_write_96bytes):
> - lddqu 80(%rsi), %xmm0
> - movdqu %xmm0, 80(%rdi)
> -L(bwd_write_80bytes):
> - lddqu 64(%rsi), %xmm0
> - movdqu %xmm0, 64(%rdi)
> -L(bwd_write_64bytes):
> - lddqu 48(%rsi), %xmm0
> - movdqu %xmm0, 48(%rdi)
> -L(bwd_write_48bytes):
> - lddqu 32(%rsi), %xmm0
> - movdqu %xmm0, 32(%rdi)
> -L(bwd_write_32bytes):
> - lddqu 16(%rsi), %xmm0
> - movdqu %xmm0, 16(%rdi)
> -L(bwd_write_16bytes):
> - lddqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> -L(bwd_write_0bytes):
> - ret
> -
> - .p2align 4
> -L(bwd_write_143bytes):
> - lddqu 127(%rsi), %xmm0
> - movdqu %xmm0, 127(%rdi)
> -L(bwd_write_127bytes):
> - lddqu 111(%rsi), %xmm0
> - movdqu %xmm0, 111(%rdi)
> -L(bwd_write_111bytes):
> - lddqu 95(%rsi), %xmm0
> - movdqu %xmm0, 95(%rdi)
> -L(bwd_write_95bytes):
> - lddqu 79(%rsi), %xmm0
> - movdqu %xmm0, 79(%rdi)
> -L(bwd_write_79bytes):
> - lddqu 63(%rsi), %xmm0
> - movdqu %xmm0, 63(%rdi)
> -L(bwd_write_63bytes):
> - lddqu 47(%rsi), %xmm0
> - movdqu %xmm0, 47(%rdi)
> -L(bwd_write_47bytes):
> - lddqu 31(%rsi), %xmm0
> - movdqu %xmm0, 31(%rdi)
> -L(bwd_write_31bytes):
> - lddqu 15(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 15(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> -
> - .p2align 4
> -L(bwd_write_15bytes):
> - mov 7(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 7(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_142bytes):
> - lddqu 126(%rsi), %xmm0
> - movdqu %xmm0, 126(%rdi)
> -L(bwd_write_126bytes):
> - lddqu 110(%rsi), %xmm0
> - movdqu %xmm0, 110(%rdi)
> -L(bwd_write_110bytes):
> - lddqu 94(%rsi), %xmm0
> - movdqu %xmm0, 94(%rdi)
> -L(bwd_write_94bytes):
> - lddqu 78(%rsi), %xmm0
> - movdqu %xmm0, 78(%rdi)
> -L(bwd_write_78bytes):
> - lddqu 62(%rsi), %xmm0
> - movdqu %xmm0, 62(%rdi)
> -L(bwd_write_62bytes):
> - lddqu 46(%rsi), %xmm0
> - movdqu %xmm0, 46(%rdi)
> -L(bwd_write_46bytes):
> - lddqu 30(%rsi), %xmm0
> - movdqu %xmm0, 30(%rdi)
> -L(bwd_write_30bytes):
> - lddqu 14(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 14(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_14bytes):
> - mov 6(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 6(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_141bytes):
> - lddqu 125(%rsi), %xmm0
> - movdqu %xmm0, 125(%rdi)
> -L(bwd_write_125bytes):
> - lddqu 109(%rsi), %xmm0
> - movdqu %xmm0, 109(%rdi)
> -L(bwd_write_109bytes):
> - lddqu 93(%rsi), %xmm0
> - movdqu %xmm0, 93(%rdi)
> -L(bwd_write_93bytes):
> - lddqu 77(%rsi), %xmm0
> - movdqu %xmm0, 77(%rdi)
> -L(bwd_write_77bytes):
> - lddqu 61(%rsi), %xmm0
> - movdqu %xmm0, 61(%rdi)
> -L(bwd_write_61bytes):
> - lddqu 45(%rsi), %xmm0
> - movdqu %xmm0, 45(%rdi)
> -L(bwd_write_45bytes):
> - lddqu 29(%rsi), %xmm0
> - movdqu %xmm0, 29(%rdi)
> -L(bwd_write_29bytes):
> - lddqu 13(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 13(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_13bytes):
> - mov 5(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 5(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_140bytes):
> - lddqu 124(%rsi), %xmm0
> - movdqu %xmm0, 124(%rdi)
> -L(bwd_write_124bytes):
> - lddqu 108(%rsi), %xmm0
> - movdqu %xmm0, 108(%rdi)
> -L(bwd_write_108bytes):
> - lddqu 92(%rsi), %xmm0
> - movdqu %xmm0, 92(%rdi)
> -L(bwd_write_92bytes):
> - lddqu 76(%rsi), %xmm0
> - movdqu %xmm0, 76(%rdi)
> -L(bwd_write_76bytes):
> - lddqu 60(%rsi), %xmm0
> - movdqu %xmm0, 60(%rdi)
> -L(bwd_write_60bytes):
> - lddqu 44(%rsi), %xmm0
> - movdqu %xmm0, 44(%rdi)
> -L(bwd_write_44bytes):
> - lddqu 28(%rsi), %xmm0
> - movdqu %xmm0, 28(%rdi)
> -L(bwd_write_28bytes):
> - lddqu 12(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 12(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_12bytes):
> - mov 4(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 4(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_139bytes):
> - lddqu 123(%rsi), %xmm0
> - movdqu %xmm0, 123(%rdi)
> -L(bwd_write_123bytes):
> - lddqu 107(%rsi), %xmm0
> - movdqu %xmm0, 107(%rdi)
> -L(bwd_write_107bytes):
> - lddqu 91(%rsi), %xmm0
> - movdqu %xmm0, 91(%rdi)
> -L(bwd_write_91bytes):
> - lddqu 75(%rsi), %xmm0
> - movdqu %xmm0, 75(%rdi)
> -L(bwd_write_75bytes):
> - lddqu 59(%rsi), %xmm0
> - movdqu %xmm0, 59(%rdi)
> -L(bwd_write_59bytes):
> - lddqu 43(%rsi), %xmm0
> - movdqu %xmm0, 43(%rdi)
> -L(bwd_write_43bytes):
> - lddqu 27(%rsi), %xmm0
> - movdqu %xmm0, 27(%rdi)
> -L(bwd_write_27bytes):
> - lddqu 11(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 11(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_11bytes):
> - mov 3(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 3(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_138bytes):
> - lddqu 122(%rsi), %xmm0
> - movdqu %xmm0, 122(%rdi)
> -L(bwd_write_122bytes):
> - lddqu 106(%rsi), %xmm0
> - movdqu %xmm0, 106(%rdi)
> -L(bwd_write_106bytes):
> - lddqu 90(%rsi), %xmm0
> - movdqu %xmm0, 90(%rdi)
> -L(bwd_write_90bytes):
> - lddqu 74(%rsi), %xmm0
> - movdqu %xmm0, 74(%rdi)
> -L(bwd_write_74bytes):
> - lddqu 58(%rsi), %xmm0
> - movdqu %xmm0, 58(%rdi)
> -L(bwd_write_58bytes):
> - lddqu 42(%rsi), %xmm0
> - movdqu %xmm0, 42(%rdi)
> -L(bwd_write_42bytes):
> - lddqu 26(%rsi), %xmm0
> - movdqu %xmm0, 26(%rdi)
> -L(bwd_write_26bytes):
> - lddqu 10(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 10(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_10bytes):
> - mov 2(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 2(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_137bytes):
> - lddqu 121(%rsi), %xmm0
> - movdqu %xmm0, 121(%rdi)
> -L(bwd_write_121bytes):
> - lddqu 105(%rsi), %xmm0
> - movdqu %xmm0, 105(%rdi)
> -L(bwd_write_105bytes):
> - lddqu 89(%rsi), %xmm0
> - movdqu %xmm0, 89(%rdi)
> -L(bwd_write_89bytes):
> - lddqu 73(%rsi), %xmm0
> - movdqu %xmm0, 73(%rdi)
> -L(bwd_write_73bytes):
> - lddqu 57(%rsi), %xmm0
> - movdqu %xmm0, 57(%rdi)
> -L(bwd_write_57bytes):
> - lddqu 41(%rsi), %xmm0
> - movdqu %xmm0, 41(%rdi)
> -L(bwd_write_41bytes):
> - lddqu 25(%rsi), %xmm0
> - movdqu %xmm0, 25(%rdi)
> -L(bwd_write_25bytes):
> - lddqu 9(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 9(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_9bytes):
> - mov 1(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 1(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_136bytes):
> - lddqu 120(%rsi), %xmm0
> - movdqu %xmm0, 120(%rdi)
> -L(bwd_write_120bytes):
> - lddqu 104(%rsi), %xmm0
> - movdqu %xmm0, 104(%rdi)
> -L(bwd_write_104bytes):
> - lddqu 88(%rsi), %xmm0
> - movdqu %xmm0, 88(%rdi)
> -L(bwd_write_88bytes):
> - lddqu 72(%rsi), %xmm0
> - movdqu %xmm0, 72(%rdi)
> -L(bwd_write_72bytes):
> - lddqu 56(%rsi), %xmm0
> - movdqu %xmm0, 56(%rdi)
> -L(bwd_write_56bytes):
> - lddqu 40(%rsi), %xmm0
> - movdqu %xmm0, 40(%rdi)
> -L(bwd_write_40bytes):
> - lddqu 24(%rsi), %xmm0
> - movdqu %xmm0, 24(%rdi)
> -L(bwd_write_24bytes):
> - lddqu 8(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 8(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_8bytes):
> - mov (%rsi), %rdx
> - mov %rdx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_135bytes):
> - lddqu 119(%rsi), %xmm0
> - movdqu %xmm0, 119(%rdi)
> -L(bwd_write_119bytes):
> - lddqu 103(%rsi), %xmm0
> - movdqu %xmm0, 103(%rdi)
> -L(bwd_write_103bytes):
> - lddqu 87(%rsi), %xmm0
> - movdqu %xmm0, 87(%rdi)
> -L(bwd_write_87bytes):
> - lddqu 71(%rsi), %xmm0
> - movdqu %xmm0, 71(%rdi)
> -L(bwd_write_71bytes):
> - lddqu 55(%rsi), %xmm0
> - movdqu %xmm0, 55(%rdi)
> -L(bwd_write_55bytes):
> - lddqu 39(%rsi), %xmm0
> - movdqu %xmm0, 39(%rdi)
> -L(bwd_write_39bytes):
> - lddqu 23(%rsi), %xmm0
> - movdqu %xmm0, 23(%rdi)
> -L(bwd_write_23bytes):
> - lddqu 7(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 7(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_7bytes):
> - mov 3(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 3(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_134bytes):
> - lddqu 118(%rsi), %xmm0
> - movdqu %xmm0, 118(%rdi)
> -L(bwd_write_118bytes):
> - lddqu 102(%rsi), %xmm0
> - movdqu %xmm0, 102(%rdi)
> -L(bwd_write_102bytes):
> - lddqu 86(%rsi), %xmm0
> - movdqu %xmm0, 86(%rdi)
> -L(bwd_write_86bytes):
> - lddqu 70(%rsi), %xmm0
> - movdqu %xmm0, 70(%rdi)
> -L(bwd_write_70bytes):
> - lddqu 54(%rsi), %xmm0
> - movdqu %xmm0, 54(%rdi)
> -L(bwd_write_54bytes):
> - lddqu 38(%rsi), %xmm0
> - movdqu %xmm0, 38(%rdi)
> -L(bwd_write_38bytes):
> - lddqu 22(%rsi), %xmm0
> - movdqu %xmm0, 22(%rdi)
> -L(bwd_write_22bytes):
> - lddqu 6(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 6(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_6bytes):
> - mov 2(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 2(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_133bytes):
> - lddqu 117(%rsi), %xmm0
> - movdqu %xmm0, 117(%rdi)
> -L(bwd_write_117bytes):
> - lddqu 101(%rsi), %xmm0
> - movdqu %xmm0, 101(%rdi)
> -L(bwd_write_101bytes):
> - lddqu 85(%rsi), %xmm0
> - movdqu %xmm0, 85(%rdi)
> -L(bwd_write_85bytes):
> - lddqu 69(%rsi), %xmm0
> - movdqu %xmm0, 69(%rdi)
> -L(bwd_write_69bytes):
> - lddqu 53(%rsi), %xmm0
> - movdqu %xmm0, 53(%rdi)
> -L(bwd_write_53bytes):
> - lddqu 37(%rsi), %xmm0
> - movdqu %xmm0, 37(%rdi)
> -L(bwd_write_37bytes):
> - lddqu 21(%rsi), %xmm0
> - movdqu %xmm0, 21(%rdi)
> -L(bwd_write_21bytes):
> - lddqu 5(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 5(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_5bytes):
> - mov 1(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 1(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_132bytes):
> - lddqu 116(%rsi), %xmm0
> - movdqu %xmm0, 116(%rdi)
> -L(bwd_write_116bytes):
> - lddqu 100(%rsi), %xmm0
> - movdqu %xmm0, 100(%rdi)
> -L(bwd_write_100bytes):
> - lddqu 84(%rsi), %xmm0
> - movdqu %xmm0, 84(%rdi)
> -L(bwd_write_84bytes):
> - lddqu 68(%rsi), %xmm0
> - movdqu %xmm0, 68(%rdi)
> -L(bwd_write_68bytes):
> - lddqu 52(%rsi), %xmm0
> - movdqu %xmm0, 52(%rdi)
> -L(bwd_write_52bytes):
> - lddqu 36(%rsi), %xmm0
> - movdqu %xmm0, 36(%rdi)
> -L(bwd_write_36bytes):
> - lddqu 20(%rsi), %xmm0
> - movdqu %xmm0, 20(%rdi)
> -L(bwd_write_20bytes):
> - lddqu 4(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 4(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_4bytes):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_131bytes):
> - lddqu 115(%rsi), %xmm0
> - movdqu %xmm0, 115(%rdi)
> -L(bwd_write_115bytes):
> - lddqu 99(%rsi), %xmm0
> - movdqu %xmm0, 99(%rdi)
> -L(bwd_write_99bytes):
> - lddqu 83(%rsi), %xmm0
> - movdqu %xmm0, 83(%rdi)
> -L(bwd_write_83bytes):
> - lddqu 67(%rsi), %xmm0
> - movdqu %xmm0, 67(%rdi)
> -L(bwd_write_67bytes):
> - lddqu 51(%rsi), %xmm0
> - movdqu %xmm0, 51(%rdi)
> -L(bwd_write_51bytes):
> - lddqu 35(%rsi), %xmm0
> - movdqu %xmm0, 35(%rdi)
> -L(bwd_write_35bytes):
> - lddqu 19(%rsi), %xmm0
> - movdqu %xmm0, 19(%rdi)
> -L(bwd_write_19bytes):
> - lddqu 3(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 3(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_3bytes):
> - mov 1(%rsi), %dx
> - mov (%rsi), %cx
> - mov %dx, 1(%rdi)
> - mov %cx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_130bytes):
> - lddqu 114(%rsi), %xmm0
> - movdqu %xmm0, 114(%rdi)
> -L(bwd_write_114bytes):
> - lddqu 98(%rsi), %xmm0
> - movdqu %xmm0, 98(%rdi)
> -L(bwd_write_98bytes):
> - lddqu 82(%rsi), %xmm0
> - movdqu %xmm0, 82(%rdi)
> -L(bwd_write_82bytes):
> - lddqu 66(%rsi), %xmm0
> - movdqu %xmm0, 66(%rdi)
> -L(bwd_write_66bytes):
> - lddqu 50(%rsi), %xmm0
> - movdqu %xmm0, 50(%rdi)
> -L(bwd_write_50bytes):
> - lddqu 34(%rsi), %xmm0
> - movdqu %xmm0, 34(%rdi)
> -L(bwd_write_34bytes):
> - lddqu 18(%rsi), %xmm0
> - movdqu %xmm0, 18(%rdi)
> -L(bwd_write_18bytes):
> - lddqu 2(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 2(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_2bytes):
> - movzwl (%rsi), %edx
> - mov %dx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_129bytes):
> - lddqu 113(%rsi), %xmm0
> - movdqu %xmm0, 113(%rdi)
> -L(bwd_write_113bytes):
> - lddqu 97(%rsi), %xmm0
> - movdqu %xmm0, 97(%rdi)
> -L(bwd_write_97bytes):
> - lddqu 81(%rsi), %xmm0
> - movdqu %xmm0, 81(%rdi)
> -L(bwd_write_81bytes):
> - lddqu 65(%rsi), %xmm0
> - movdqu %xmm0, 65(%rdi)
> -L(bwd_write_65bytes):
> - lddqu 49(%rsi), %xmm0
> - movdqu %xmm0, 49(%rdi)
> -L(bwd_write_49bytes):
> - lddqu 33(%rsi), %xmm0
> - movdqu %xmm0, 33(%rdi)
> -L(bwd_write_33bytes):
> - lddqu 17(%rsi), %xmm0
> - movdqu %xmm0, 17(%rdi)
> -L(bwd_write_17bytes):
> - lddqu 1(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 1(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_1bytes):
> - movzbl (%rsi), %edx
> - mov %dl, (%rdi)
> - ret
> -
> -END (MEMCPY)
> -
> - .section .rodata.ssse3,"a",@progbits
> - .p2align 3
> -L(table_144_bytes_bwd):
> - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
> -
> - .p2align 3
> -L(table_144_bytes_fwd):
> - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
> -
> - .p2align 3
> -L(shl_table_fwd):
> - .int JMPTBL (L(shl_0), L(shl_table_fwd))
> - .int JMPTBL (L(shl_1), L(shl_table_fwd))
> - .int JMPTBL (L(shl_2), L(shl_table_fwd))
> - .int JMPTBL (L(shl_3), L(shl_table_fwd))
> - .int JMPTBL (L(shl_4), L(shl_table_fwd))
> - .int JMPTBL (L(shl_5), L(shl_table_fwd))
> - .int JMPTBL (L(shl_6), L(shl_table_fwd))
> - .int JMPTBL (L(shl_7), L(shl_table_fwd))
> - .int JMPTBL (L(shl_8), L(shl_table_fwd))
> - .int JMPTBL (L(shl_9), L(shl_table_fwd))
> - .int JMPTBL (L(shl_10), L(shl_table_fwd))
> - .int JMPTBL (L(shl_11), L(shl_table_fwd))
> - .int JMPTBL (L(shl_12), L(shl_table_fwd))
> - .int JMPTBL (L(shl_13), L(shl_table_fwd))
> - .int JMPTBL (L(shl_14), L(shl_table_fwd))
> - .int JMPTBL (L(shl_15), L(shl_table_fwd))
> -
> - .p2align 3
> -L(shl_table_bwd):
> - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> deleted file mode 100644
> index f9a4e9aff9..0000000000
> --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_MEMMOVE
> -#define MEMCPY __memmove_ssse3_back
> -#define MEMCPY_CHK __memmove_chk_ssse3_back
> -#include "memcpy-ssse3-back.S"
> --
> 2.25.1
>
* Re: [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3
2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and EVEX
> versions are generally preferable. memcpy/memmove is one exception where
> avoiding unaligned loads with `palignr` is important for some targets.
>
> This commit replaces memmove-ssse3 with a better optimized and lower
> code footprint version. It also aliases memcpy to memmove.
>
> Aside from this function, all other SSSE3 functions should be safe to
> remove.
>
> The performance does not change drastically, although it shows overall
> improvement without any major regressions or gains.
>
> bench-memcpy geometric_mean(N=50) New / Original: 0.962
>
> bench-memcpy-random geometric_mean(N=50) New / Original: 0.895
>
> bench-memcpy-large geometric_mean(N=50) New / Original: 0.894
>
> Benchmarks were run on Zhaoxin KX-6840@2000MHz. See the attached
> numbers for all results.
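>
> For reference, geometric_mean(N=50) above is the standard geometric
> mean of the 50 per-run New/Original ratios, i.e. the exp of the mean
> of the logs. A minimal C sketch of that computation (the helper name
> is illustrative and is not part of the glibc benchtests harness):
>
>     #include <math.h>
>     #include <stddef.h>
>
>     /* Geometric mean of n per-run New/Original ratios:
>        exp of the mean of the logs.  */
>     static double
>     geometric_mean (const double *ratios, size_t n)
>     {
>       double log_sum = 0.0;
>       for (size_t i = 0; i < n; i++)
>         log_sum += log (ratios[i]);   /* accumulate logs */
>       return exp (log_sum / n);       /* back to a ratio */
>     }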
>
> More importantly, this saves 7246 bytes of code size in memmove and an
> additional 10741 bytes by reusing the memmove code for memcpy (17987
> bytes saved in total), as well as an additional 896 bytes of rodata for
> the jump table entries.
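>
> Reusing one body for both symbols follows the same #define-and-#include
> wrapper pattern seen in the deleted memmove-ssse3-back.S earlier in the
> thread. A hypothetical sketch of such an alias file (the file and macro
> names here are assumptions, not the literal contents of this patch):
>
>     /* memcpy-ssse3.S: hypothetical wrapper reusing the memmove body.  */
>     #define MEMCPY __memcpy_ssse3
>     #define MEMCPY_CHK __memcpy_chk_ssse3
>     #include "memmove-ssse3.S"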
> ---
> sysdeps/x86_64/multiarch/Makefile | 1 -
> sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 ----------------------
> sysdeps/x86_64/multiarch/memmove-ssse3.S | 386 ++-
> 3 files changed, 382 insertions(+), 3156 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 303fb5d734..e7ea963fc0 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -16,7 +16,6 @@ sysdep_routines += \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> memcmpeq-sse2 \
> - memcpy-ssse3 \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> deleted file mode 100644
> index 65644d3a09..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
> +++ /dev/null
> @@ -1,3151 +0,0 @@
> -/* memcpy with SSSE3
> - Copyright (C) 2010-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -
> -#include "asm-syntax.h"
> -
> -#ifndef MEMCPY
> -# define MEMCPY __memcpy_ssse3
> -# define MEMCPY_CHK __memcpy_chk_ssse3
> -# define MEMPCPY __mempcpy_ssse3
> -# define MEMPCPY_CHK __mempcpy_chk_ssse3
> -#endif
> -
> -#define JMPTBL(I, B) I - B
> -
> -/* Branch to an entry in a jump table. TABLE is a jump table with
> - relative offsets. INDEX is a register contains the index into the
> - jump table. SCALE is the scale of INDEX. */
> -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), INDEX; \
> - lea (%r11, INDEX), INDEX; \
> - _CET_NOTRACK jmp *INDEX; \
> - ud2
> -
> - .section .text.ssse3,"ax",@progbits
> -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
> -ENTRY (MEMPCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMPCPY_CHK)
> -
> -ENTRY (MEMPCPY)
> - mov %RDI_LP, %RAX_LP
> - add %RDX_LP, %RAX_LP
> - jmp L(start)
> -END (MEMPCPY)
> -#endif
> -
> -#if !defined USE_AS_BCOPY
> -ENTRY (MEMCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMCPY_CHK)
> -#endif
> -
> -ENTRY (MEMCPY)
> - mov %RDI_LP, %RAX_LP
> -#ifdef USE_AS_MEMPCPY
> - add %RDX_LP, %RAX_LP
> -#endif
> -
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -#endif
> -
> -#ifdef USE_AS_MEMMOVE
> - cmp %rsi, %rdi
> - jb L(copy_forward)
> - je L(write_0bytes)
> - cmp $79, %rdx
> - jbe L(copy_forward)
> - jmp L(copy_backward)
> -L(copy_forward):
> -#endif
> -L(start):
> - cmp $79, %rdx
> - lea L(table_less_80bytes)(%rip), %r11
> - ja L(80bytesormore)
> - movslq (%r11, %rdx, 4), %r9
> - add %rdx, %rsi
> - add %rdx, %rdi
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(80bytesormore):
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jle L(copy_backward)
> -#endif
> -
> - movdqu (%rsi), %xmm0
> - mov %rdi, %rcx
> - and $-16, %rdi
> - add $16, %rdi
> - mov %rcx, %r8
> - sub %rdi, %rcx
> - add %rcx, %rdx
> - sub %rcx, %rsi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> - cmp %rcx, %rdx
> - mov %rsi, %r9
> - ja L(large_page_fwd)
> - and $0xf, %r9
> - jz L(shl_0)
> -#ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_data_cache_size_half(%rip), %RCX_LP
> -#endif
> - BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
> -
> - .p2align 4
> -L(copy_backward):
> - movdqu -16(%rsi, %rdx), %xmm0
> - add %rdx, %rsi
> - lea -16(%rdi, %rdx), %r8
> - add %rdx, %rdi
> -
> - mov %rdi, %rcx
> - and $0xf, %rcx
> - xor %rcx, %rdi
> - sub %rcx, %rdx
> - sub %rcx, %rsi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -
> - cmp %rcx, %rdx
> - mov %rsi, %r9
> - ja L(large_page_bwd)
> - and $0xf, %r9
> - jz L(shl_0_bwd)
> -#ifdef DATA_CACHE_SIZE_HALF
> - mov $DATA_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_data_cache_size_half(%rip), %RCX_LP
> -#endif
> - BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
> -
> - .p2align 4
> -L(shl_0):
> - sub $16, %rdx
> - movdqa (%rsi), %xmm1
> - add $16, %rsi
> - movdqa %xmm1, (%rdi)
> - add $16, %rdi
> - cmp $128, %rdx
> - movdqu %xmm0, (%r8)
> - ja L(shl_0_gobble)
> - cmp $64, %rdx
> - jb L(shl_0_less_64bytes)
> - movaps (%rsi), %xmm4
> - movaps 16(%rsi), %xmm1
> - movaps 32(%rsi), %xmm2
> - movaps 48(%rsi), %xmm3
> - movaps %xmm4, (%rdi)
> - movaps %xmm1, 16(%rdi)
> - movaps %xmm2, 32(%rdi)
> - movaps %xmm3, 48(%rdi)
> - sub $64, %rdx
> - add $64, %rsi
> - add $64, %rdi
> -L(shl_0_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble):
> -#ifdef DATA_CACHE_SIZE_HALF
> - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %RDX_LP
> -#endif
> - lea -128(%rdx), %rdx
> - jae L(shl_0_gobble_mem_loop)
> -L(shl_0_gobble_cache_loop):
> - movdqa (%rsi), %xmm4
> - movaps 0x10(%rsi), %xmm1
> - movaps 0x20(%rsi), %xmm2
> - movaps 0x30(%rsi), %xmm3
> -
> - movdqa %xmm4, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> -
> - sub $128, %rdx
> - movaps 0x40(%rsi), %xmm4
> - movaps 0x50(%rsi), %xmm5
> - movaps 0x60(%rsi), %xmm6
> - movaps 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> - movaps %xmm4, 0x40(%rdi)
> - movaps %xmm5, 0x50(%rdi)
> - movaps %xmm6, 0x60(%rdi)
> - movaps %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_cache_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_cache_less_64bytes)
> -
> - movdqa (%rsi), %xmm4
> - sub $0x40, %rdx
> - movdqa 0x10(%rsi), %xmm1
> -
> - movdqa %xmm4, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> -
> - movdqa 0x20(%rsi), %xmm4
> - movdqa 0x30(%rsi), %xmm1
> - add $0x40, %rsi
> -
> - movdqa %xmm4, 0x20(%rdi)
> - movdqa %xmm1, 0x30(%rdi)
> - add $0x40, %rdi
> -L(shl_0_cache_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_mem_loop):
> - prefetcht0 0x1c0(%rsi)
> - prefetcht0 0x280(%rsi)
> -
> - movdqa (%rsi), %xmm0
> - movdqa 0x10(%rsi), %xmm1
> - movdqa 0x20(%rsi), %xmm2
> - movdqa 0x30(%rsi), %xmm3
> - movdqa 0x40(%rsi), %xmm4
> - movdqa 0x50(%rsi), %xmm5
> - movdqa 0x60(%rsi), %xmm6
> - movdqa 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> - sub $0x80, %rdx
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - movdqa %xmm2, 0x20(%rdi)
> - movdqa %xmm3, 0x30(%rdi)
> - movdqa %xmm4, 0x40(%rdi)
> - movdqa %xmm5, 0x50(%rdi)
> - movdqa %xmm6, 0x60(%rdi)
> - movdqa %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_mem_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_mem_less_64bytes)
> -
> - movdqa (%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa 0x10(%rsi), %xmm1
> -
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> -
> - movdqa 0x20(%rsi), %xmm0
> - movdqa 0x30(%rsi), %xmm1
> - add $0x40, %rsi
> -
> - movdqa %xmm0, 0x20(%rdi)
> - movdqa %xmm1, 0x30(%rdi)
> - add $0x40, %rdi
> -L(shl_0_mem_less_64bytes):
> - cmp $0x20, %rdx
> - jb L(shl_0_mem_less_32bytes)
> - movdqa (%rsi), %xmm0
> - sub $0x20, %rdx
> - movdqa 0x10(%rsi), %xmm1
> - add $0x20, %rsi
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - add $0x20, %rdi
> -L(shl_0_mem_less_32bytes):
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
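
The two gobble loops above share the same 128-byte body; the only
difference is that the "mem" flavour, taken once the remaining length
reaches half the data cache, prefetches well ahead of the loads.  A
sketch of the distinction (the helper is mine; 0x1c0 and 0x280 are the
asm's prefetch distances, and the memcpy stands in for the eight
aligned 16-byte load/store pairs):

#include <stddef.h>
#include <string.h>

static void
copy_gobble (char *dst, const char *src, size_t n, size_t data_cache_half)
{
  size_t i = 0;
  int prefetch = n >= data_cache_half;
  for (; i + 128 <= n; i += 128)
    {
      if (prefetch)
        {
          __builtin_prefetch (src + i + 0x1c0, 0, 3);   /* prefetcht0 */
          __builtin_prefetch (src + i + 0x280, 0, 3);
        }
      memcpy (dst + i, src + i, 128);
    }
  memcpy (dst + i, src + i, n - i);   /* tail via the jump table */
}
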
> -
> - .p2align 4
> -L(shl_0_bwd):
> - sub $16, %rdx
> - movdqa -0x10(%rsi), %xmm1
> - sub $16, %rsi
> - movdqa %xmm1, -0x10(%rdi)
> - sub $16, %rdi
> - cmp $0x80, %rdx
> - movdqu %xmm0, (%r8)
> - ja L(shl_0_gobble_bwd)
> - cmp $64, %rdx
> - jb L(shl_0_less_64bytes_bwd)
> - movaps -0x10(%rsi), %xmm0
> - movaps -0x20(%rsi), %xmm1
> - movaps -0x30(%rsi), %xmm2
> - movaps -0x40(%rsi), %xmm3
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - sub $64, %rdx
> - sub $0x40, %rsi
> - sub $0x40, %rdi
> -L(shl_0_less_64bytes_bwd):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_bwd):
> -#ifdef DATA_CACHE_SIZE_HALF
> - cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %RDX_LP
> -#endif
> - lea -128(%rdx), %rdx
> - jae L(shl_0_gobble_mem_bwd_loop)
> -L(shl_0_gobble_bwd_loop):
> - movdqa -0x10(%rsi), %xmm0
> - movaps -0x20(%rsi), %xmm1
> - movaps -0x30(%rsi), %xmm2
> - movaps -0x40(%rsi), %xmm3
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> -
> - sub $0x80, %rdx
> - movaps -0x50(%rsi), %xmm4
> - movaps -0x60(%rsi), %xmm5
> - movaps -0x70(%rsi), %xmm6
> - movaps -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> - movaps %xmm4, -0x50(%rdi)
> - movaps %xmm5, -0x60(%rdi)
> - movaps %xmm6, -0x70(%rdi)
> - movaps %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_gobble_bwd_less_64bytes)
> -
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa -0x20(%rsi), %xmm1
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> -
> - movdqa -0x30(%rsi), %xmm0
> - movdqa -0x40(%rsi), %xmm1
> - sub $0x40, %rsi
> -
> - movdqa %xmm0, -0x30(%rdi)
> - movdqa %xmm1, -0x40(%rdi)
> - sub $0x40, %rdi
> -L(shl_0_gobble_bwd_less_64bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_gobble_mem_bwd_loop):
> - prefetcht0 -0x1c0(%rsi)
> - prefetcht0 -0x280(%rsi)
> - movdqa -0x10(%rsi), %xmm0
> - movdqa -0x20(%rsi), %xmm1
> - movdqa -0x30(%rsi), %xmm2
> - movdqa -0x40(%rsi), %xmm3
> - movdqa -0x50(%rsi), %xmm4
> - movdqa -0x60(%rsi), %xmm5
> - movdqa -0x70(%rsi), %xmm6
> - movdqa -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> - sub $0x80, %rdx
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> - movdqa %xmm2, -0x30(%rdi)
> - movdqa %xmm3, -0x40(%rdi)
> - movdqa %xmm4, -0x50(%rdi)
> - movdqa %xmm5, -0x60(%rdi)
> - movdqa %xmm6, -0x70(%rdi)
> - movdqa %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> -
> - jae L(shl_0_gobble_mem_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(shl_0_mem_bwd_less_64bytes)
> -
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x40, %rdx
> - movdqa -0x20(%rsi), %xmm1
> -
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> -
> - movdqa -0x30(%rsi), %xmm0
> - movdqa -0x40(%rsi), %xmm1
> - sub $0x40, %rsi
> -
> - movdqa %xmm0, -0x30(%rdi)
> - movdqa %xmm1, -0x40(%rdi)
> - sub $0x40, %rdi
> -L(shl_0_mem_bwd_less_64bytes):
> - cmp $0x20, %rdx
> - jb L(shl_0_mem_bwd_less_32bytes)
> - movdqa -0x10(%rsi), %xmm0
> - sub $0x20, %rdx
> - movdqa -0x20(%rsi), %xmm1
> - sub $0x20, %rsi
> - movdqa %xmm0, -0x10(%rdi)
> - movdqa %xmm1, -0x20(%rdi)
> - sub $0x20, %rdi
> -L(shl_0_mem_bwd_less_32bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1):
> - lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x01(%rsi), %xmm1
> - jb L(L1_fwd)
> - lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
> -L(L1_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_1_loop_L1):
> - sub $64, %rdx
> - movaps 0x0f(%rsi), %xmm2
> - movaps 0x1f(%rsi), %xmm3
> - movaps 0x2f(%rsi), %xmm4
> - movaps 0x3f(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $1, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $1, %xmm3, %xmm4
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $1, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_1_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
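
L(shl_1) through L(shl_15) below are fifteen copies of the same SSSE3
idiom, differing only in the palignr immediate: do only aligned
16-byte loads, starting just below the true source position, and
stitch neighbouring vectors together.  On the older cores this file
targeted, two aligned loads plus palignr beat one cache-line-crossing
movdqu.  In intrinsics terms (a sketch; the helper name is mine, and
it needs -mssse3):

#include <stddef.h>
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 */

/* One L(shl_1) step: 'base' is the source rounded down to 16 bytes,
   so the real data starts at base+1.  palignr concatenates hi:lo and
   shifts right one byte, yielding the 16 bytes that belong at the
   aligned destination.  */
static __m128i
shl_1_chunk (const __m128i *base, size_t k)
{
  __m128i lo = _mm_load_si128 (base + k);       /* movaps -0x01(%rsi) */
  __m128i hi = _mm_load_si128 (base + k + 1);   /* movaps  0x0f(%rsi) */
  return _mm_alignr_epi8 (hi, lo, 1);           /* palignr $1 */
}
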
> -
> - .p2align 4
> -L(shl_1_bwd):
> - lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x01(%rsi), %xmm1
> - jb L(L1_bwd)
> - lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
> -L(L1_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_1_bwd_loop_L1):
> - movaps -0x11(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x21(%rsi), %xmm3
> - movaps -0x31(%rsi), %xmm4
> - movaps -0x41(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $1, %xmm2, %xmm1
> - palignr $1, %xmm3, %xmm2
> - palignr $1, %xmm4, %xmm3
> - palignr $1, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_1_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_1_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2):
> - lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x02(%rsi), %xmm1
> - jb L(L2_fwd)
> - lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
> -L(L2_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_2_loop_L1):
> - sub $64, %rdx
> - movaps 0x0e(%rsi), %xmm2
> - movaps 0x1e(%rsi), %xmm3
> - movaps 0x2e(%rsi), %xmm4
> - movaps 0x3e(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $2, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $2, %xmm3, %xmm4
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $2, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_2_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2_bwd):
> - lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x02(%rsi), %xmm1
> - jb L(L2_bwd)
> - lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
> -L(L2_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_2_bwd_loop_L1):
> - movaps -0x12(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x22(%rsi), %xmm3
> - movaps -0x32(%rsi), %xmm4
> - movaps -0x42(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $2, %xmm2, %xmm1
> - palignr $2, %xmm3, %xmm2
> - palignr $2, %xmm4, %xmm3
> - palignr $2, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_2_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_2_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3):
> - lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x03(%rsi), %xmm1
> - jb L(L3_fwd)
> - lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
> -L(L3_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_3_loop_L1):
> - sub $64, %rdx
> - movaps 0x0d(%rsi), %xmm2
> - movaps 0x1d(%rsi), %xmm3
> - movaps 0x2d(%rsi), %xmm4
> - movaps 0x3d(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $3, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $3, %xmm3, %xmm4
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $3, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_3_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3_bwd):
> - lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x03(%rsi), %xmm1
> - jb L(L3_bwd)
> - lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
> -L(L3_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_3_bwd_loop_L1):
> - movaps -0x13(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x23(%rsi), %xmm3
> - movaps -0x33(%rsi), %xmm4
> - movaps -0x43(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $3, %xmm2, %xmm1
> - palignr $3, %xmm3, %xmm2
> - palignr $3, %xmm4, %xmm3
> - palignr $3, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_3_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_3_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4):
> - lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x04(%rsi), %xmm1
> - jb L(L4_fwd)
> - lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
> -L(L4_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_4_loop_L1):
> - sub $64, %rdx
> - movaps 0x0c(%rsi), %xmm2
> - movaps 0x1c(%rsi), %xmm3
> - movaps 0x2c(%rsi), %xmm4
> - movaps 0x3c(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $4, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $4, %xmm3, %xmm4
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $4, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_4_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4_bwd):
> - lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x04(%rsi), %xmm1
> - jb L(L4_bwd)
> - lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
> -L(L4_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_4_bwd_loop_L1):
> - movaps -0x14(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x24(%rsi), %xmm3
> - movaps -0x34(%rsi), %xmm4
> - movaps -0x44(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $4, %xmm2, %xmm1
> - palignr $4, %xmm3, %xmm2
> - palignr $4, %xmm4, %xmm3
> - palignr $4, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_4_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_4_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5):
> - lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x05(%rsi), %xmm1
> - jb L(L5_fwd)
> - lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
> -L(L5_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_5_loop_L1):
> - sub $64, %rdx
> - movaps 0x0b(%rsi), %xmm2
> - movaps 0x1b(%rsi), %xmm3
> - movaps 0x2b(%rsi), %xmm4
> - movaps 0x3b(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $5, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $5, %xmm3, %xmm4
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $5, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_5_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5_bwd):
> - lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x05(%rsi), %xmm1
> - jb L(L5_bwd)
> - lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
> -L(L5_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_5_bwd_loop_L1):
> - movaps -0x15(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x25(%rsi), %xmm3
> - movaps -0x35(%rsi), %xmm4
> - movaps -0x45(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $5, %xmm2, %xmm1
> - palignr $5, %xmm3, %xmm2
> - palignr $5, %xmm4, %xmm3
> - palignr $5, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_5_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_5_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6):
> - lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x06(%rsi), %xmm1
> - jb L(L6_fwd)
> - lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
> -L(L6_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_6_loop_L1):
> - sub $64, %rdx
> - movaps 0x0a(%rsi), %xmm2
> - movaps 0x1a(%rsi), %xmm3
> - movaps 0x2a(%rsi), %xmm4
> - movaps 0x3a(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $6, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $6, %xmm3, %xmm4
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $6, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_6_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6_bwd):
> - lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x06(%rsi), %xmm1
> - jb L(L6_bwd)
> - lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
> -L(L6_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_6_bwd_loop_L1):
> - movaps -0x16(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x26(%rsi), %xmm3
> - movaps -0x36(%rsi), %xmm4
> - movaps -0x46(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $6, %xmm2, %xmm1
> - palignr $6, %xmm3, %xmm2
> - palignr $6, %xmm4, %xmm3
> - palignr $6, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_6_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_6_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7):
> - lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x07(%rsi), %xmm1
> - jb L(L7_fwd)
> - lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
> -L(L7_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_7_loop_L1):
> - sub $64, %rdx
> - movaps 0x09(%rsi), %xmm2
> - movaps 0x19(%rsi), %xmm3
> - movaps 0x29(%rsi), %xmm4
> - movaps 0x39(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $7, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $7, %xmm3, %xmm4
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $7, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_7_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7_bwd):
> - lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x07(%rsi), %xmm1
> - jb L(L7_bwd)
> - lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
> -L(L7_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_7_bwd_loop_L1):
> - movaps -0x17(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x27(%rsi), %xmm3
> - movaps -0x37(%rsi), %xmm4
> - movaps -0x47(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $7, %xmm2, %xmm1
> - palignr $7, %xmm3, %xmm2
> - palignr $7, %xmm4, %xmm3
> - palignr $7, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_7_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_7_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8):
> - lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x08(%rsi), %xmm1
> - jb L(L8_fwd)
> - lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
> -L(L8_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> -L(shl_8_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_8_loop_L1):
> - sub $64, %rdx
> - movaps 0x08(%rsi), %xmm2
> - movaps 0x18(%rsi), %xmm3
> - movaps 0x28(%rsi), %xmm4
> - movaps 0x38(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $8, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $8, %xmm3, %xmm4
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $8, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_8_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> - .p2align 4
> -L(shl_8_end):
> - lea 64(%rdx), %rdx
> - movaps %xmm4, -0x20(%rdi)
> - add %rdx, %rsi
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8_bwd):
> - lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x08(%rsi), %xmm1
> - jb L(L8_bwd)
> - lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
> -L(L8_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_8_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_8_bwd_loop_L1):
> - movaps -0x18(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x28(%rsi), %xmm3
> - movaps -0x38(%rsi), %xmm4
> - movaps -0x48(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $8, %xmm2, %xmm1
> - palignr $8, %xmm3, %xmm2
> - palignr $8, %xmm4, %xmm3
> - palignr $8, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_8_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_8_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9):
> - lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x09(%rsi), %xmm1
> - jb L(L9_fwd)
> - lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
> -L(L9_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_9_loop_L1):
> - sub $64, %rdx
> - movaps 0x07(%rsi), %xmm2
> - movaps 0x17(%rsi), %xmm3
> - movaps 0x27(%rsi), %xmm4
> - movaps 0x37(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $9, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $9, %xmm3, %xmm4
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $9, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_9_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9_bwd):
> - lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x09(%rsi), %xmm1
> - jb L(L9_bwd)
> - lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
> -L(L9_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_9_bwd_loop_L1):
> - movaps -0x19(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x29(%rsi), %xmm3
> - movaps -0x39(%rsi), %xmm4
> - movaps -0x49(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $9, %xmm2, %xmm1
> - palignr $9, %xmm3, %xmm2
> - palignr $9, %xmm4, %xmm3
> - palignr $9, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_9_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_9_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10):
> - lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - jb L(L10_fwd)
> - lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
> -L(L10_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_10_loop_L1):
> - sub $64, %rdx
> - movaps 0x06(%rsi), %xmm2
> - movaps 0x16(%rsi), %xmm3
> - movaps 0x26(%rsi), %xmm4
> - movaps 0x36(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $10, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $10, %xmm3, %xmm4
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $10, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_10_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10_bwd):
> - lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - jb L(L10_bwd)
> - lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
> -L(L10_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_10_bwd_loop_L1):
> - movaps -0x1a(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2a(%rsi), %xmm3
> - movaps -0x3a(%rsi), %xmm4
> - movaps -0x4a(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $10, %xmm2, %xmm1
> - palignr $10, %xmm3, %xmm2
> - palignr $10, %xmm4, %xmm3
> - palignr $10, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_10_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_10_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11):
> - lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - jb L(L11_fwd)
> - lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
> -L(L11_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_11_loop_L1):
> - sub $64, %rdx
> - movaps 0x05(%rsi), %xmm2
> - movaps 0x15(%rsi), %xmm3
> - movaps 0x25(%rsi), %xmm4
> - movaps 0x35(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $11, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $11, %xmm3, %xmm4
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $11, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_11_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11_bwd):
> - lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - jb L(L11_bwd)
> - lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
> -L(L11_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_11_bwd_loop_L1):
> - movaps -0x1b(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2b(%rsi), %xmm3
> - movaps -0x3b(%rsi), %xmm4
> - movaps -0x4b(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $11, %xmm2, %xmm1
> - palignr $11, %xmm3, %xmm2
> - palignr $11, %xmm4, %xmm3
> - palignr $11, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_11_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_11_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12):
> - lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0c(%rsi), %xmm1
> - jb L(L12_fwd)
> - lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
> -L(L12_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_12_loop_L1):
> - sub $64, %rdx
> - movaps 0x04(%rsi), %xmm2
> - movaps 0x14(%rsi), %xmm3
> - movaps 0x24(%rsi), %xmm4
> - movaps 0x34(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $12, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $12, %xmm3, %xmm4
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $12, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_12_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12_bwd):
> - lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0c(%rsi), %xmm1
> - jb L(L12_bwd)
> - lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
> -L(L12_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_12_bwd_loop_L1):
> - movaps -0x1c(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2c(%rsi), %xmm3
> - movaps -0x3c(%rsi), %xmm4
> - movaps -0x4c(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $12, %xmm2, %xmm1
> - palignr $12, %xmm3, %xmm2
> - palignr $12, %xmm4, %xmm3
> - palignr $12, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_12_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_12_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13):
> - lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - jb L(L13_fwd)
> - lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
> -L(L13_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_13_loop_L1):
> - sub $64, %rdx
> - movaps 0x03(%rsi), %xmm2
> - movaps 0x13(%rsi), %xmm3
> - movaps 0x23(%rsi), %xmm4
> - movaps 0x33(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $13, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $13, %xmm3, %xmm4
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $13, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_13_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13_bwd):
> - lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - jb L(L13_bwd)
> - lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
> -L(L13_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_13_bwd_loop_L1):
> - movaps -0x1d(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2d(%rsi), %xmm3
> - movaps -0x3d(%rsi), %xmm4
> - movaps -0x4d(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $13, %xmm2, %xmm1
> - palignr $13, %xmm3, %xmm2
> - palignr $13, %xmm4, %xmm3
> - palignr $13, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_13_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_13_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14):
> - lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - jb L(L14_fwd)
> - lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
> -L(L14_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_14_loop_L1):
> - sub $64, %rdx
> - movaps 0x02(%rsi), %xmm2
> - movaps 0x12(%rsi), %xmm3
> - movaps 0x22(%rsi), %xmm4
> - movaps 0x32(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $14, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $14, %xmm3, %xmm4
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $14, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_14_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14_bwd):
> - lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - jb L(L14_bwd)
> - lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
> -L(L14_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_14_bwd_loop_L1):
> - movaps -0x1e(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2e(%rsi), %xmm3
> - movaps -0x3e(%rsi), %xmm4
> - movaps -0x4e(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $14, %xmm2, %xmm1
> - palignr $14, %xmm3, %xmm2
> - palignr $14, %xmm4, %xmm3
> - palignr $14, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_14_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_14_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15):
> - lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - jb L(L15_fwd)
> - lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
> -L(L15_fwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_loop_L2):
> - prefetchnta 0x1c0(%rsi)
> -L(shl_15_loop_L1):
> - sub $64, %rdx
> - movaps 0x01(%rsi), %xmm2
> - movaps 0x11(%rsi), %xmm3
> - movaps 0x21(%rsi), %xmm4
> - movaps 0x31(%rsi), %xmm5
> - movdqa %xmm5, %xmm6
> - palignr $15, %xmm4, %xmm5
> - lea 64(%rsi), %rsi
> - palignr $15, %xmm3, %xmm4
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rdi), %rdi
> - palignr $15, %xmm1, %xmm2
> - movdqa %xmm6, %xmm1
> - movdqa %xmm2, -0x40(%rdi)
> - movaps %xmm3, -0x30(%rdi)
> - jb L(shl_15_end)
> - movaps %xmm4, -0x20(%rdi)
> - movaps %xmm5, -0x10(%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_end):
> - movaps %xmm4, -0x20(%rdi)
> - lea 64(%rdx), %rdx
> - movaps %xmm5, -0x10(%rdi)
> - add %rdx, %rdi
> - movdqu %xmm0, (%r8)
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15_bwd):
> - lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
> - cmp %rcx, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - jb L(L15_bwd)
> - lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
> -L(L15_bwd):
> - lea -64(%rdx), %rdx
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_bwd_loop_L2):
> - prefetchnta -0x1c0(%rsi)
> -L(shl_15_bwd_loop_L1):
> - movaps -0x1f(%rsi), %xmm2
> - sub $0x40, %rdx
> - movaps -0x2f(%rsi), %xmm3
> - movaps -0x3f(%rsi), %xmm4
> - movaps -0x4f(%rsi), %xmm5
> - lea -0x40(%rsi), %rsi
> - palignr $15, %xmm2, %xmm1
> - palignr $15, %xmm3, %xmm2
> - palignr $15, %xmm4, %xmm3
> - palignr $15, %xmm5, %xmm4
> -
> - movaps %xmm1, -0x10(%rdi)
> - movaps %xmm5, %xmm1
> -
> - movaps %xmm2, -0x20(%rdi)
> - lea -0x40(%rdi), %rdi
> -
> - movaps %xmm3, 0x10(%rdi)
> - jb L(shl_15_bwd_end)
> - movaps %xmm4, (%rdi)
> - _CET_NOTRACK jmp *%r9
> - ud2
> -L(shl_15_bwd_end):
> - movaps %xmm4, (%rdi)
> - lea 64(%rdx), %rdx
> - movdqu %xmm0, (%r8)
> - BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
> -
> - .p2align 4
> -L(write_72bytes):
> - movdqu -72(%rsi), %xmm0
> - movdqu -56(%rsi), %xmm1
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rcx
> - movdqu %xmm0, -72(%rdi)
> - movdqu %xmm1, -56(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_64bytes):
> - movdqu -64(%rsi), %xmm0
> - mov -48(%rsi), %rcx
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -64(%rdi)
> - mov %rcx, -48(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_56bytes):
> - movdqu -56(%rsi), %xmm0
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rcx
> - movdqu %xmm0, -56(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_48bytes):
> - mov -48(%rsi), %rcx
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %rcx, -48(%rdi)
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_40bytes):
> - mov -40(%rsi), %r8
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r8, -40(%rdi)
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_32bytes):
> - mov -32(%rsi), %r9
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r9, -32(%rdi)
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_24bytes):
> - mov -24(%rsi), %r10
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r10, -24(%rdi)
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_16bytes):
> - mov -16(%rsi), %r11
> - mov -8(%rsi), %rdx
> - mov %r11, -16(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_8bytes):
> - mov -8(%rsi), %rdx
> - mov %rdx, -8(%rdi)
> -L(write_0bytes):
> - ret
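
Everything from here down is the table_less_80bytes machinery: the
pointers arrive already advanced past the end of the region, and each
L(write_Nbytes) block copies the final N bytes with a fixed ladder of
16/8/4/2/1-byte moves, every load issued before any store (which is
what keeps the forward path overlap-safe, per the note above), with a
few overlapping moves where N does not decompose evenly.  A generic
two-move stand-in for the 8..16 byte cases, just to show the trick
(the helper is mine, not glibc's):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void
copy_tail_8_16 (char *dst_end, const char *src_end, size_t n)
{
  uint64_t head, tail;                          /* both loads first... */
  memcpy (&head, src_end - n, sizeof head);
  memcpy (&tail, src_end - 8, sizeof tail);
  memcpy (dst_end - n, &head, sizeof head);     /* ...then both stores */
  memcpy (dst_end - 8, &tail, sizeof tail);
}
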
> -
> - .p2align 4
> -L(write_73bytes):
> - movdqu -73(%rsi), %xmm0
> - movdqu -57(%rsi), %xmm1
> - mov -41(%rsi), %rcx
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %r8
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -73(%rdi)
> - movdqu %xmm1, -57(%rdi)
> - mov %rcx, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %r8, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_65bytes):
> - movdqu -65(%rsi), %xmm0
> - movdqu -49(%rsi), %xmm1
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -65(%rdi)
> - movdqu %xmm1, -49(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_57bytes):
> - movdqu -57(%rsi), %xmm0
> - mov -41(%rsi), %r8
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -57(%rdi)
> - mov %r8, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_49bytes):
> - movdqu -49(%rsi), %xmm0
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -49(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_41bytes):
> - mov -41(%rsi), %r8
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r8, -41(%rdi)
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_33bytes):
> - mov -33(%rsi), %r9
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r9, -33(%rdi)
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_25bytes):
> - mov -25(%rsi), %r10
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -1(%rsi), %dl
> - mov %r10, -25(%rdi)
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_17bytes):
> - mov -17(%rsi), %r11
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -17(%rdi)
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_9bytes):
> - mov -9(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -9(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_1bytes):
> - mov -1(%rsi), %dl
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_74bytes):
> - movdqu -74(%rsi), %xmm0
> - movdqu -58(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -74(%rdi)
> - movdqu %xmm1, -58(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_66bytes):
> - movdqu -66(%rsi), %xmm0
> - movdqu -50(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -66(%rdi)
> - movdqu %xmm1, -50(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_58bytes):
> - movdqu -58(%rsi), %xmm1
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm1, -58(%rdi)
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_50bytes):
> - movdqu -50(%rsi), %xmm0
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -50(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_42bytes):
> - mov -42(%rsi), %r8
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -42(%rdi)
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_34bytes):
> - mov -34(%rsi), %r9
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -34(%rdi)
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_26bytes):
> - mov -26(%rsi), %r10
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -26(%rdi)
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_18bytes):
> - mov -18(%rsi), %r11
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -18(%rdi)
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_10bytes):
> - mov -10(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -10(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_2bytes):
> - mov -2(%rsi), %dx
> - mov %dx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_75bytes):
> - movdqu -75(%rsi), %xmm0
> - movdqu -59(%rsi), %xmm1
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -75(%rdi)
> - movdqu %xmm1, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_67bytes):
> - movdqu -67(%rsi), %xmm0
> - movdqu -59(%rsi), %xmm1
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -67(%rdi)
> - movdqu %xmm1, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_59bytes):
> - movdqu -59(%rsi), %xmm0
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -59(%rdi)
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_51bytes):
> - movdqu -51(%rsi), %xmm0
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -51(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_43bytes):
> - mov -43(%rsi), %r8
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -43(%rdi)
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_35bytes):
> - mov -35(%rsi), %r9
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -35(%rdi)
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_27bytes):
> - mov -27(%rsi), %r10
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -27(%rdi)
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_19bytes):
> - mov -19(%rsi), %r11
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -19(%rdi)
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_11bytes):
> - mov -11(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -11(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_3bytes):
> - mov -3(%rsi), %dx
> - mov -2(%rsi), %cx
> - mov %dx, -3(%rdi)
> - mov %cx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_76bytes):
> - movdqu -76(%rsi), %xmm0
> - movdqu -60(%rsi), %xmm1
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -76(%rdi)
> - movdqu %xmm1, -60(%rdi)
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_68bytes):
> - movdqu -68(%rsi), %xmm0
> - movdqu -52(%rsi), %xmm1
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -68(%rdi)
> - movdqu %xmm1, -52(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_60bytes):
> - movdqu -60(%rsi), %xmm0
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -60(%rdi)
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_52bytes):
> - movdqu -52(%rsi), %xmm0
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - movdqu %xmm0, -52(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_44bytes):
> - mov -44(%rsi), %r8
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r8, -44(%rdi)
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_36bytes):
> - mov -36(%rsi), %r9
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r9, -36(%rdi)
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_28bytes):
> - mov -28(%rsi), %r10
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r10, -28(%rdi)
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_20bytes):
> - mov -20(%rsi), %r11
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %r11, -20(%rdi)
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_12bytes):
> - mov -12(%rsi), %rcx
> - mov -4(%rsi), %edx
> - mov %rcx, -12(%rdi)
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_4bytes):
> - mov -4(%rsi), %edx
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_77bytes):
> - movdqu -77(%rsi), %xmm0
> - movdqu -61(%rsi), %xmm1
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -77(%rdi)
> - movdqu %xmm1, -61(%rdi)
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_69bytes):
> - movdqu -69(%rsi), %xmm0
> - movdqu -53(%rsi), %xmm1
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -69(%rdi)
> - movdqu %xmm1, -53(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_61bytes):
> - movdqu -61(%rsi), %xmm0
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -61(%rdi)
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_53bytes):
> - movdqu -53(%rsi), %xmm0
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -53(%rdi)
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_45bytes):
> - mov -45(%rsi), %r8
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -45(%rdi)
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_37bytes):
> - mov -37(%rsi), %r9
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -37(%rdi)
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_29bytes):
> - mov -29(%rsi), %r10
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -29(%rdi)
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_21bytes):
> - mov -21(%rsi), %r11
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -21(%rdi)
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_13bytes):
> - mov -13(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -13(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_5bytes):
> - mov -5(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -5(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_78bytes):
> - movdqu -78(%rsi), %xmm0
> - movdqu -62(%rsi), %xmm1
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -78(%rdi)
> - movdqu %xmm1, -62(%rdi)
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_70bytes):
> - movdqu -70(%rsi), %xmm0
> - movdqu -54(%rsi), %xmm1
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -70(%rdi)
> - movdqu %xmm1, -54(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_62bytes):
> - movdqu -62(%rsi), %xmm0
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -62(%rdi)
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_54bytes):
> - movdqu -54(%rsi), %xmm0
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -54(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_46bytes):
> - mov -46(%rsi), %r8
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -46(%rdi)
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_38bytes):
> - mov -38(%rsi), %r9
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -38(%rdi)
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_30bytes):
> - mov -30(%rsi), %r10
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -30(%rdi)
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_22bytes):
> - mov -22(%rsi), %r11
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -22(%rdi)
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_14bytes):
> - mov -14(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -14(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_6bytes):
> - mov -6(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -6(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_79bytes):
> - movdqu -79(%rsi), %xmm0
> - movdqu -63(%rsi), %xmm1
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -79(%rdi)
> - movdqu %xmm1, -63(%rdi)
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_71bytes):
> - movdqu -71(%rsi), %xmm0
> - movdqu -55(%rsi), %xmm1
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -71(%rdi)
> - movdqu %xmm1, -55(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_63bytes):
> - movdqu -63(%rsi), %xmm0
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -63(%rdi)
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_55bytes):
> - movdqu -55(%rsi), %xmm0
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - movdqu %xmm0, -55(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_47bytes):
> - mov -47(%rsi), %r8
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r8, -47(%rdi)
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_39bytes):
> - mov -39(%rsi), %r9
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r9, -39(%rdi)
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_31bytes):
> - mov -31(%rsi), %r10
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r10, -31(%rdi)
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_23bytes):
> - mov -23(%rsi), %r11
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %r11, -23(%rdi)
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_15bytes):
> - mov -15(%rsi), %rcx
> - mov -8(%rsi), %rdx
> - mov %rcx, -15(%rdi)
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(write_7bytes):
> - mov -7(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -7(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(large_page_fwd):
> - movdqu (%rsi), %xmm1
> - lea 16(%rsi), %rsi
> - movdqu %xmm0, (%r8)
> - movntdq %xmm1, (%rdi)
> - lea 16(%rdi), %rdi
> - lea -0x90(%rdx), %rdx
> -#ifdef USE_AS_MEMMOVE
> - mov %rsi, %r9
> - sub %rdi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_fwd)
> - shl $2, %rcx
> - cmp %rcx, %rdx
> - jb L(ll_cache_copy_fwd_start)
> -L(memmove_is_memcpy_fwd):
> -#endif
> -L(large_page_loop):
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - movntdq %xmm4, 0x40(%rdi)
> - movntdq %xmm5, 0x50(%rdi)
> - movntdq %xmm6, 0x60(%rdi)
> - movntdq %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(large_page_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_less_64bytes)
> -
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - lea 0x40(%rsi), %rsi
> -
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - lea 0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_less_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - sfence
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> -#ifdef USE_AS_MEMMOVE
> - .p2align 4
> -L(ll_cache_copy_fwd_start):
> - prefetcht0 0x1c0(%rsi)
> - prefetcht0 0x200(%rsi)
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lea 0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movaps %xmm0, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> - movaps %xmm4, 0x40(%rdi)
> - movaps %xmm5, 0x50(%rdi)
> - movaps %xmm6, 0x60(%rdi)
> - movaps %xmm7, 0x70(%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(ll_cache_copy_fwd_start)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_ll_less_fwd_64bytes)
> -
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - lea 0x40(%rsi), %rsi
> -
> - movaps %xmm0, (%rdi)
> - movaps %xmm1, 0x10(%rdi)
> - movaps %xmm2, 0x20(%rdi)
> - movaps %xmm3, 0x30(%rdi)
> - lea 0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_ll_less_fwd_64bytes):
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> -#endif
> - .p2align 4
> -L(large_page_bwd):
> - movdqu -0x10(%rsi), %xmm1
> - lea -16(%rsi), %rsi
> - movdqu %xmm0, (%r8)
> - movdqa %xmm1, -0x10(%rdi)
> - lea -16(%rdi), %rdi
> - lea -0x90(%rdx), %rdx
> -#ifdef USE_AS_MEMMOVE
> - mov %rdi, %r9
> - sub %rsi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_bwd)
> - cmp %rcx, %r9
> - jb L(ll_cache_copy_bwd_start)
> -L(memmove_is_memcpy_bwd):
> -#endif
> -L(large_page_bwd_loop):
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - movdqu -0x50(%rsi), %xmm4
> - movdqu -0x60(%rsi), %xmm5
> - movdqu -0x70(%rsi), %xmm6
> - movdqu -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movntdq %xmm0, -0x10(%rdi)
> - movntdq %xmm1, -0x20(%rdi)
> - movntdq %xmm2, -0x30(%rdi)
> - movntdq %xmm3, -0x40(%rdi)
> - movntdq %xmm4, -0x50(%rdi)
> - movntdq %xmm5, -0x60(%rdi)
> - movntdq %xmm6, -0x70(%rdi)
> - movntdq %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> - jae L(large_page_bwd_loop)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_less_bwd_64bytes)
> -
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - lea -0x40(%rsi), %rsi
> -
> - movntdq %xmm0, -0x10(%rdi)
> - movntdq %xmm1, -0x20(%rdi)
> - movntdq %xmm2, -0x30(%rdi)
> - movntdq %xmm3, -0x40(%rdi)
> - lea -0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_less_bwd_64bytes):
> - sfence
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -
> -#ifdef USE_AS_MEMMOVE
> - .p2align 4
> -L(ll_cache_copy_bwd_start):
> - prefetcht0 -0x1c0(%rsi)
> - prefetcht0 -0x200(%rsi)
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - movdqu -0x50(%rsi), %xmm4
> - movdqu -0x60(%rsi), %xmm5
> - movdqu -0x70(%rsi), %xmm6
> - movdqu -0x80(%rsi), %xmm7
> - lea -0x80(%rsi), %rsi
> -
> - sub $0x80, %rdx
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - movaps %xmm4, -0x50(%rdi)
> - movaps %xmm5, -0x60(%rdi)
> - movaps %xmm6, -0x70(%rdi)
> - movaps %xmm7, -0x80(%rdi)
> - lea -0x80(%rdi), %rdi
> - jae L(ll_cache_copy_bwd_start)
> - cmp $-0x40, %rdx
> - lea 0x80(%rdx), %rdx
> - jl L(large_page_ll_less_bwd_64bytes)
> -
> - movdqu -0x10(%rsi), %xmm0
> - movdqu -0x20(%rsi), %xmm1
> - movdqu -0x30(%rsi), %xmm2
> - movdqu -0x40(%rsi), %xmm3
> - lea -0x40(%rsi), %rsi
> -
> - movaps %xmm0, -0x10(%rdi)
> - movaps %xmm1, -0x20(%rdi)
> - movaps %xmm2, -0x30(%rdi)
> - movaps %xmm3, -0x40(%rdi)
> - lea -0x40(%rdi), %rdi
> - sub $0x40, %rdx
> -L(large_page_ll_less_bwd_64bytes):
> - BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
> -#endif
> -
> -END (MEMCPY)
> -
> - .section .rodata.ssse3,"a",@progbits
> - .p2align 3
> -L(table_less_80bytes):
> - .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
> - .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
> -
> - .p2align 3
> -L(shl_table):
> - .int JMPTBL (L(shl_0), L(shl_table))
> - .int JMPTBL (L(shl_1), L(shl_table))
> - .int JMPTBL (L(shl_2), L(shl_table))
> - .int JMPTBL (L(shl_3), L(shl_table))
> - .int JMPTBL (L(shl_4), L(shl_table))
> - .int JMPTBL (L(shl_5), L(shl_table))
> - .int JMPTBL (L(shl_6), L(shl_table))
> - .int JMPTBL (L(shl_7), L(shl_table))
> - .int JMPTBL (L(shl_8), L(shl_table))
> - .int JMPTBL (L(shl_9), L(shl_table))
> - .int JMPTBL (L(shl_10), L(shl_table))
> - .int JMPTBL (L(shl_11), L(shl_table))
> - .int JMPTBL (L(shl_12), L(shl_table))
> - .int JMPTBL (L(shl_13), L(shl_table))
> - .int JMPTBL (L(shl_14), L(shl_table))
> - .int JMPTBL (L(shl_15), L(shl_table))
> -
> - .p2align 3
> -L(shl_table_bwd):
> - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
> -
> -#endif
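
The tail-handling scheme in the file deleted above is worth spelling out: after the bulk loop, BRANCH_TO_JMPTBL_ENTRY indexes L(table_less_80bytes) by the remaining length, and each L(write_Nbytes) entry copies exactly N bytes counting back from the end of the buffer, completing all loads before the first store so the routines stay safe for memmove's overlapping case. A rough C sketch of one entry, with illustrative names only (this is not glibc code):

#include <string.h>

/* Sketch of L(write_13bytes): with both pointers already advanced to
   one past the end of the copy, bytes [-13, -1] are moved with two
   overlapping 8-byte transfers (bytes -8..-6 are written twice,
   harmlessly).  All loads happen before any store, so an overlapping
   source survives.  The deleted jump table holds one such routine for
   every length 0..79.  */
static void
write_13bytes (unsigned char *dst_end, const unsigned char *src_end)
{
  unsigned long long lo, hi;        /* mirror %rcx and %rdx */
  memcpy (&lo, src_end - 13, 8);    /* mov -13(%rsi), %rcx */
  memcpy (&hi, src_end - 8, 8);     /* mov  -8(%rsi), %rdx */
  memcpy (dst_end - 13, &lo, 8);    /* mov %rcx, -13(%rdi) */
  memcpy (dst_end - 8, &hi, 8);     /* mov %rdx,  -8(%rdi) */
}

The `.int JMPTBL (entry, table)` rows store each routine's offset relative to the table rather than an absolute address, which keeps every entry 4 bytes wide and position-independent.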
> diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
> index 295430b1ef..84e4e0f6cb 100644
> --- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
> +++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
> @@ -1,4 +1,382 @@
> -#define USE_AS_MEMMOVE
> -#define MEMCPY __memmove_ssse3
> -#define MEMCPY_CHK __memmove_chk_ssse3
> -#include "memcpy-ssse3.S"
> +#include <sysdep.h>
> +
> +#ifndef MEMMOVE
> +# define MEMMOVE __memmove_ssse3
> +# define MEMMOVE_CHK __memmove_chk_ssse3
> +# define MEMCPY __memcpy_ssse3
> +# define MEMCPY_CHK __memcpy_chk_ssse3
> +# define MEMPCPY __mempcpy_ssse3
> +# define MEMPCPY_CHK __mempcpy_chk_ssse3
> +#endif
> +
> + .section .text.ssse3, "ax", @progbits
> +ENTRY(MEMPCPY_CHK)
> + cmp %RDX_LP, %RCX_LP
> + jb HIDDEN_JUMPTARGET(__chk_fail)
> +END(MEMPCPY_CHK)
> +
> +ENTRY(MEMPCPY)
> + mov %RDI_LP, %RAX_LP
> + add %RDX_LP, %RAX_LP
> + jmp L(start)
> +END(MEMPCPY)
> +
> +ENTRY(MEMMOVE_CHK)
> + cmp %RDX_LP, %RCX_LP
> + jb HIDDEN_JUMPTARGET(__chk_fail)
> +END(MEMMOVE_CHK)
> +
> +ENTRY_P2ALIGN(MEMMOVE, 6)
> + movq %rdi, %rax
> +L(start):
> + cmpq $16, %rdx
> + jb L(copy_0_15)
> +
> + /* These loads are always useful. */
> + movups 0(%rsi), %xmm0
> + movups -16(%rsi, %rdx), %xmm7
> + cmpq $32, %rdx
> + ja L(more_2x_vec)
> +
> + movups %xmm0, 0(%rdi)
> + movups %xmm7, -16(%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 8
> +L(copy_4x_vec):
> + movups 16(%rsi), %xmm1
> + movups -32(%rsi, %rdx), %xmm2
> +
> + movups %xmm0, 0(%rdi)
> + movups %xmm1, 16(%rdi)
> + movups %xmm2, -32(%rdi, %rdx)
> + movups %xmm7, -16(%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 8
> +L(copy_0_15):
> + cmpl $8, %edx
> + ja L(copy_9_15)
> +
> + cmpl $4, %edx
> + jb L(copy_0_3)
> +
> + movl 0(%rsi), %ecx
> + movl -4(%rsi, %rdx), %esi
> + movl %ecx, 0(%rdi)
> + movl %esi, -4(%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 8
> +L(copy_9_15):
> + movq 0(%rsi), %rcx
> + movq -8(%rsi, %rdx), %rsi
> + movq %rcx, 0(%rdi)
> + movq %rsi, -8(%rdi, %rdx)
> + ret
> +
> + .p2align 4,, 4
> +L(copy_0_3):
> + cmpl $1, %edx
> + jl L(copy_0_0)
> + movzbl (%rsi), %ecx
> + je L(copy_0_1)
> +
> + movzwl -2(%rsi, %rdx), %esi
> + movw %si, -2(%rdi, %rdx)
> +L(copy_0_1):
> + movb %cl, (%rdi)
> +L(copy_0_0):
> +L(nop):
> + ret
> +
> + .p2align 4
> +L(more_2x_vec):
> + cmpq $64, %rdx
> + jbe L(copy_4x_vec)
> +
> + /* We use rcx later to get alignr value. */
> + movq %rdi, %rcx
> +
> + /* Backward copy for overlap + dst > src for memmove safety. */
> + subq %rsi, %rcx
> + cmpq %rdx, %rcx
> + jb L(copy_backward)
> +
> + /* Load tail. */
> +
> + /* -16(%rsi, %rdx) already loaded into xmm7. */
> + movups -32(%rsi, %rdx), %xmm8
> + movups -48(%rsi, %rdx), %xmm9
> +
> + /* Get misalignment. */
> + andl $0xf, %ecx
> +
> + movq %rsi, %r9
> + addq %rcx, %rsi
> + andq $-16, %rsi
> + /* Get first vec for `palignr`. */
> + movaps (%rsi), %xmm1
> +
> + /* We have loaded (%rsi) so safe to do this store before the
> + loop. */
> + movups %xmm0, (%rdi)
> +
> +#ifdef SHARED_CACHE_SIZE_HALF
> + cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
> +#else
> + cmp __x86_shared_cache_size_half(%rip), %rdx
> +#endif
> + ja L(large_memcpy)
> +
> + leaq -64(%rdi, %rdx), %r8
> + andq $-16, %rdi
> + movl $48, %edx
> +
> + leaq L(loop_fwd_start)(%rip), %r9
> + sall $6, %ecx
> + addq %r9, %rcx
> + jmp * %rcx
> +
> + .p2align 4,, 8
> +L(copy_backward):
> + testq %rcx, %rcx
> + jz L(nop)
> +
> + /* Preload tail. */
> +
> + /* (%rsi) already loaded into xmm0. */
> + movups 16(%rsi), %xmm4
> + movups 32(%rsi), %xmm5
> +
> + movq %rdi, %r8
> + subq %rdi, %rsi
> + leaq -49(%rdi, %rdx), %rdi
> + andq $-16, %rdi
> + addq %rdi, %rsi
> + andq $-16, %rsi
> +
> + movaps 48(%rsi), %xmm6
> +
> +
> + leaq L(loop_bkwd_start)(%rip), %r9
> + andl $0xf, %ecx
> + sall $6, %ecx
> + addq %r9, %rcx
> + jmp * %rcx
> +
> + .p2align 4,, 8
> +L(large_memcpy):
> + movups -64(%r9, %rdx), %xmm10
> + movups -80(%r9, %rdx), %xmm11
> +
> + sall $5, %ecx
> + leal (%rcx, %rcx, 2), %r8d
> + leaq -96(%rdi, %rdx), %rcx
> + andq $-16, %rdi
> + leaq L(large_loop_fwd_start)(%rip), %rdx
> + addq %r8, %rdx
> + jmp * %rdx
> +
> +
> + /* Instead of a typical jump table, all 16 loops are exactly
> + 64 bytes in size, so we can just jump to first loop + rcx *
> + 64. Before modifying any loop ensure all their sizes match!
> + */
> + .p2align 6
> +L(loop_fwd_start):
> +L(loop_fwd_0x0):
> + movaps 16(%rsi), %xmm1
> + movaps 32(%rsi), %xmm2
> + movaps 48(%rsi), %xmm3
> + movaps %xmm1, 16(%rdi)
> + movaps %xmm2, 32(%rdi)
> + movaps %xmm3, 48(%rdi)
> + addq %rdx, %rdi
> + addq %rdx, %rsi
> + cmpq %rdi, %r8
> + ja L(loop_fwd_0x0)
> +L(end_loop_fwd):
> + movups %xmm9, 16(%r8)
> + movups %xmm8, 32(%r8)
> + movups %xmm7, 48(%r8)
> + ret
> +
> + /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
> + 60 bytes otherwise. */
> +#define ALIGNED_LOOP_FWD(align_by); \
> + .p2align 6; \
> +L(loop_fwd_ ## align_by): \
> + movaps 16(%rsi), %xmm0; \
> + movaps 32(%rsi), %xmm2; \
> + movaps 48(%rsi), %xmm3; \
> + movaps %xmm3, %xmm4; \
> + palignr $align_by, %xmm2, %xmm3; \
> + palignr $align_by, %xmm0, %xmm2; \
> + palignr $align_by, %xmm1, %xmm0; \
> + movaps %xmm4, %xmm1; \
> + movaps %xmm0, 16(%rdi); \
> + movaps %xmm2, 32(%rdi); \
> + movaps %xmm3, 48(%rdi); \
> + addq %rdx, %rdi; \
> + addq %rdx, %rsi; \
> + cmpq %rdi, %r8; \
> + ja L(loop_fwd_ ## align_by); \
> + jmp L(end_loop_fwd);
> +
> + /* Must be in descending order. */
> + ALIGNED_LOOP_FWD (0xf)
> + ALIGNED_LOOP_FWD (0xe)
> + ALIGNED_LOOP_FWD (0xd)
> + ALIGNED_LOOP_FWD (0xc)
> + ALIGNED_LOOP_FWD (0xb)
> + ALIGNED_LOOP_FWD (0xa)
> + ALIGNED_LOOP_FWD (0x9)
> + ALIGNED_LOOP_FWD (0x8)
> + ALIGNED_LOOP_FWD (0x7)
> + ALIGNED_LOOP_FWD (0x6)
> + ALIGNED_LOOP_FWD (0x5)
> + ALIGNED_LOOP_FWD (0x4)
> + ALIGNED_LOOP_FWD (0x3)
> + ALIGNED_LOOP_FWD (0x2)
> + ALIGNED_LOOP_FWD (0x1)
> +
> + .p2align 6
> +L(large_loop_fwd_start):
> +L(large_loop_fwd_0x0):
> + movaps 16(%rsi), %xmm1
> + movaps 32(%rsi), %xmm2
> + movaps 48(%rsi), %xmm3
> + movaps 64(%rsi), %xmm4
> + movaps 80(%rsi), %xmm5
> + movntps %xmm1, 16(%rdi)
> + movntps %xmm2, 32(%rdi)
> + movntps %xmm3, 48(%rdi)
> + movntps %xmm4, 64(%rdi)
> + movntps %xmm5, 80(%rdi)
> + addq $80, %rdi
> + addq $80, %rsi
> + cmpq %rdi, %rcx
> + ja L(large_loop_fwd_0x0)
> +
> + /* Ensure no icache line split on tail. */
> + .p2align 4
> +L(end_large_loop_fwd):
> + sfence
> + movups %xmm11, 16(%rcx)
> + movups %xmm10, 32(%rcx)
> + movups %xmm9, 48(%rcx)
> + movups %xmm8, 64(%rcx)
> + movups %xmm7, 80(%rcx)
> + ret
> +
> +
> + /* Each loop is > 64 bytes and <= 96 bytes; 32-byte alignment
> + ensures 96-byte spacing between each. */
> +#define ALIGNED_LARGE_LOOP_FWD(align_by); \
> + .p2align 5; \
> +L(large_loop_fwd_ ## align_by): \
> + movaps 16(%rsi), %xmm0; \
> + movaps 32(%rsi), %xmm2; \
> + movaps 48(%rsi), %xmm3; \
> + movaps 64(%rsi), %xmm4; \
> + movaps 80(%rsi), %xmm5; \
> + movaps %xmm5, %xmm6; \
> + palignr $align_by, %xmm4, %xmm5; \
> + palignr $align_by, %xmm3, %xmm4; \
> + palignr $align_by, %xmm2, %xmm3; \
> + palignr $align_by, %xmm0, %xmm2; \
> + palignr $align_by, %xmm1, %xmm0; \
> + movaps %xmm6, %xmm1; \
> + movntps %xmm0, 16(%rdi); \
> + movntps %xmm2, 32(%rdi); \
> + movntps %xmm3, 48(%rdi); \
> + movntps %xmm4, 64(%rdi); \
> + movntps %xmm5, 80(%rdi); \
> + addq $80, %rdi; \
> + addq $80, %rsi; \
> + cmpq %rdi, %rcx; \
> + ja L(large_loop_fwd_ ## align_by); \
> + jmp L(end_large_loop_fwd);
> +
> + /* Must be in descending order. */
> + ALIGNED_LARGE_LOOP_FWD (0xf)
> + ALIGNED_LARGE_LOOP_FWD (0xe)
> + ALIGNED_LARGE_LOOP_FWD (0xd)
> + ALIGNED_LARGE_LOOP_FWD (0xc)
> + ALIGNED_LARGE_LOOP_FWD (0xb)
> + ALIGNED_LARGE_LOOP_FWD (0xa)
> + ALIGNED_LARGE_LOOP_FWD (0x9)
> + ALIGNED_LARGE_LOOP_FWD (0x8)
> + ALIGNED_LARGE_LOOP_FWD (0x7)
> + ALIGNED_LARGE_LOOP_FWD (0x6)
> + ALIGNED_LARGE_LOOP_FWD (0x5)
> + ALIGNED_LARGE_LOOP_FWD (0x4)
> + ALIGNED_LARGE_LOOP_FWD (0x3)
> + ALIGNED_LARGE_LOOP_FWD (0x2)
> + ALIGNED_LARGE_LOOP_FWD (0x1)
> +
> +
> + .p2align 6
> +L(loop_bkwd_start):
> +L(loop_bkwd_0x0):
> + movaps 32(%rsi), %xmm1
> + movaps 16(%rsi), %xmm2
> + movaps 0(%rsi), %xmm3
> + movaps %xmm1, 32(%rdi)
> + movaps %xmm2, 16(%rdi)
> + movaps %xmm3, 0(%rdi)
> + subq $48, %rdi
> + subq $48, %rsi
> + cmpq %rdi, %r8
> + jb L(loop_bkwd_0x0)
> +L(end_loop_bkwd):
> + movups %xmm7, -16(%r8, %rdx)
> + movups %xmm0, 0(%r8)
> + movups %xmm4, 16(%r8)
> + movups %xmm5, 32(%r8)
> +
> + ret
> +
> +
> + /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
> + 60 bytes otherwise. */
> +#define ALIGNED_LOOP_BKWD(align_by); \
> + .p2align 6; \
> +L(loop_bkwd_ ## align_by): \
> + movaps 32(%rsi), %xmm1; \
> + movaps 16(%rsi), %xmm2; \
> + movaps 0(%rsi), %xmm3; \
> + palignr $align_by, %xmm1, %xmm6; \
> + palignr $align_by, %xmm2, %xmm1; \
> + palignr $align_by, %xmm3, %xmm2; \
> + movaps %xmm6, 32(%rdi); \
> + movaps %xmm1, 16(%rdi); \
> + movaps %xmm2, 0(%rdi); \
> + subq $48, %rdi; \
> + subq $48, %rsi; \
> + movaps %xmm3, %xmm6; \
> + cmpq %rdi, %r8; \
> + jb L(loop_bkwd_ ## align_by); \
> + jmp L(end_loop_bkwd);
> +
> + /* Must be in descending order. */
> + ALIGNED_LOOP_BKWD (0xf)
> + ALIGNED_LOOP_BKWD (0xe)
> + ALIGNED_LOOP_BKWD (0xd)
> + ALIGNED_LOOP_BKWD (0xc)
> + ALIGNED_LOOP_BKWD (0xb)
> + ALIGNED_LOOP_BKWD (0xa)
> + ALIGNED_LOOP_BKWD (0x9)
> + ALIGNED_LOOP_BKWD (0x8)
> + ALIGNED_LOOP_BKWD (0x7)
> + ALIGNED_LOOP_BKWD (0x6)
> + ALIGNED_LOOP_BKWD (0x5)
> + ALIGNED_LOOP_BKWD (0x4)
> + ALIGNED_LOOP_BKWD (0x3)
> + ALIGNED_LOOP_BKWD (0x2)
> + ALIGNED_LOOP_BKWD (0x1)
> +END(MEMMOVE)
> +
> +strong_alias (MEMMOVE, MEMCPY)
> +strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
> --
> 2.25.1
>
^ permalink raw reply [flat|nested] 49+ messages in thread
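
A note on the ALIGNED_LOOP_FWD/ALIGNED_LOOP_BKWD macros in the patch above: `palignr` takes its byte-shift as an immediate, so the shift cannot be a runtime value. That is why the patch materializes 16 identically sized loop bodies and computes the entry point arithmetically. The following C-intrinsics sketch of one forward step assumes `src` and `dst + 16` are 16-byte aligned and the logical source is `src + SHIFT`; the names are illustrative, not glibc's:

#include <immintrin.h>          /* SSSE3: _mm_alignr_epi8 */

/* One 48-byte step of the shifted forward copy: afterwards,
   dst[16 + k] == src[SHIFT + k] for k in [0, 48).  SHIFT must be a
   compile-time constant in [1, 15] because palignr's count is an
   immediate.  *carry holds the aligned vector from src[0..15] on
   entry and from src[48..63] on exit, playing the role of %xmm1.  */
#define SHIFTED_COPY_STEP(SHIFT, dst, src, carry)                         \
  do {                                                                    \
    __m128i a = _mm_load_si128 ((const __m128i *) ((src) + 16));          \
    __m128i b = _mm_load_si128 ((const __m128i *) ((src) + 32));          \
    __m128i c = _mm_load_si128 ((const __m128i *) ((src) + 48));          \
    __m128i keep = c;                   /* movaps %xmm3, %xmm4 */         \
    c = _mm_alignr_epi8 (c, b, SHIFT);  /* palignr $N, %xmm2, %xmm3 */    \
    b = _mm_alignr_epi8 (b, a, SHIFT);                                    \
    a = _mm_alignr_epi8 (a, *(carry), SHIFT);                             \
    *(carry) = keep;                                                      \
    _mm_store_si128 ((__m128i *) ((dst) + 16), a);                        \
    _mm_store_si128 ((__m128i *) ((dst) + 32), b);                        \
    _mm_store_si128 ((__m128i *) ((dst) + 48), c);                        \
  } while (0)

Expanding this step once per SHIFT value corresponds to the 15 shifted loop instances; padding each instance to exactly 64 bytes is what lets the dispatch reach it as loop_fwd_start plus shift times 64 with no table load.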
* Re: [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S
2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> New code saves size (-303 bytes) and has significantly better
> performance.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.634
> ---
> sysdeps/x86_64/memcmp.S | 884 ++++++++++++++---------
> sysdeps/x86_64/memcmpeq.S | 2 +-
> sysdeps/x86_64/multiarch/Makefile | 2 +-
> sysdeps/x86_64/multiarch/memcmp-sse2.S | 4 +-
> sysdeps/x86_64/multiarch/memcmpeq-sse2.S | 4 +-
> sysdeps/x86_64/multiarch/wmemcmp-c.c | 9 -
> sysdeps/x86_64/multiarch/wmemcmp-sse2.S | 25 +
> sysdeps/x86_64/wmemcmp.S | 21 +
> 8 files changed, 575 insertions(+), 376 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-c.c
> create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-sse2.S
> create mode 100644 sysdeps/x86_64/wmemcmp.S
>
> diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
> index e02a53ea1e..b153694048 100644
> --- a/sysdeps/x86_64/memcmp.S
> +++ b/sysdeps/x86_64/memcmp.S
> @@ -18,395 +18,557 @@
>
> #include <sysdep.h>
>
> - .text
> -ENTRY (memcmp)
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - movl %edx, %edx
> +#ifdef USE_AS_WMEMCMP
> +# define PCMPEQ pcmpeqd
> +# define CHAR_SIZE 4
> +# define SIZE_OFFSET (0)
> +#else
> +# define PCMPEQ pcmpeqb
> +# define CHAR_SIZE 1
> #endif
> - test %RDX_LP, %RDX_LP
> - jz L(finz)
> - cmpq $1, %rdx
> - jbe L(finr1b)
> - subq %rdi, %rsi
> - movq %rdx, %r10
> - cmpq $32, %r10
> - jae L(gt32)
> - /* Handle small chunks and last block of less than 32 bytes. */
> -L(small):
> - testq $1, %r10
> - jz L(s2b)
> - movzbl (%rdi), %eax
> - movzbl (%rdi, %rsi), %edx
> - subq $1, %r10
> - je L(finz1)
> - addq $1, %rdi
> - subl %edx, %eax
> - jnz L(exit)
> -L(s2b):
> - testq $2, %r10
> - jz L(s4b)
> - movzwl (%rdi), %eax
> - movzwl (%rdi, %rsi), %edx
> - subq $2, %r10
> +
> #ifdef USE_AS_MEMCMPEQ
> - je L(finz1)
> +# define SIZE_OFFSET (0)
> +# define CHECK_CMP(x, y) subl x, y
> #else
> - je L(fin2_7)
> +# ifndef SIZE_OFFSET
> +# define SIZE_OFFSET (CHAR_PER_VEC * 2)
> +# endif
> +# define CHECK_CMP(x, y) cmpl x, y
> #endif
> - addq $2, %rdi
> - cmpl %edx, %eax
> -#ifdef USE_AS_MEMCMPEQ
> - jnz L(neq_early)
> +
> +#define VEC_SIZE 16
> +#define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
> +
> +#ifndef MEMCMP
> +# define MEMCMP memcmp
> +#endif
> +
> + .text
> +ENTRY(MEMCMP)
> +#ifdef USE_AS_WMEMCMP
> + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
> + in ecx for code size. This is preferable to using `incw` as
> + it avoids partial register stalls on older hardware (pre
> + SnB). */
> + movl $0xffff, %ecx
> +#endif
> + cmpq $CHAR_PER_VEC, %rdx
> + ja L(more_1x_vec)
> +
> +#ifdef USE_AS_WMEMCMP
> + /* Saves a byte of code by keeping the fall-through path n = [2, 4]
> + in the initial cache line. */
> + decl %edx
> + jle L(cmp_0_1)
> +
> + movq (%rsi), %xmm0
> + movq (%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + subl %ecx, %eax
> + jnz L(ret_nonzero_vec_start_0)
> +
> + movq -4(%rsi, %rdx, CHAR_SIZE), %xmm0
> + movq -4(%rdi, %rdx, CHAR_SIZE), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + subl %ecx, %eax
> + jnz L(ret_nonzero_vec_end_0_adj)
> #else
> - jnz L(fin2_7)
> + cmpl $8, %edx
> + ja L(cmp_9_16)
> +
> + cmpl $4, %edx
> + jb L(cmp_0_3)
> +
> +# ifdef USE_AS_MEMCMPEQ
> + movl (%rsi), %eax
> + subl (%rdi), %eax
> +
> + movl -4(%rsi, %rdx), %esi
> + subl -4(%rdi, %rdx), %esi
> +
> + orl %esi, %eax
> + ret
> +# else
> + /* Combine comparisons for lo and hi 4-byte comparisons. */
> + movl -4(%rsi, %rdx), %ecx
> + movl -4(%rdi, %rdx), %eax
> + shlq $32, %rcx
> + shlq $32, %rax
> + movl (%rsi), %esi
> + movl (%rdi), %edi
> + orq %rsi, %rcx
> + orq %rdi, %rax
> + /* Only compute proper return if not-equal. */
> + cmpq %rcx, %rax
> + jnz L(ret_nonzero)
> + xorl %eax, %eax
> + ret
> +# endif
> +
> + .p2align 4,, 10
> +L(cmp_9_16):
> +# ifdef USE_AS_MEMCMPEQ
> + movq (%rsi), %rax
> + subq (%rdi), %rax
> +
> + movq -8(%rsi, %rdx), %rcx
> + subq -8(%rdi, %rdx), %rcx
> + orq %rcx, %rax
> + /* Convert 64 bit -> 32 bit boolean (we should have made the ABI
> + return long). */
> + setnz %cl
> + movzbl %cl, %eax
> +# else
> + movq (%rsi), %rcx
> + movq (%rdi), %rax
> + /* Only compute proper return if not-equal. */
> + cmpq %rcx, %rax
> + jnz L(ret_nonzero)
> +
> + movq -8(%rsi, %rdx, CHAR_SIZE), %rcx
> + movq -8(%rdi, %rdx, CHAR_SIZE), %rax
> + /* Only compute proper return if not-equal. */
> + cmpq %rcx, %rax
> + jnz L(ret_nonzero)
> + xorl %eax, %eax
> +# endif
> #endif
> -L(s4b):
> - testq $4, %r10
> - jz L(s8b)
> - movl (%rdi), %eax
> - movl (%rdi, %rsi), %edx
> - subq $4, %r10
> -#ifdef USE_AS_MEMCMPEQ
> - je L(finz1)
> + ret
> +
> + .p2align 4,, 8
> +L(cmp_0_1):
> + /* Flag set by earlier comparison against 1. */
> + jne L(cmp_0_0)
> +#ifdef USE_AS_WMEMCMP
> + movl (%rdi), %ecx
> + xorl %edx, %edx
> + cmpl (%rsi), %ecx
> + je L(cmp_0_0)
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> #else
> - je L(fin2_7)
> + movzbl (%rdi), %eax
> + movzbl (%rsi), %ecx
> + subl %ecx, %eax
> #endif
> - addq $4, %rdi
> - cmpl %edx, %eax
> -#ifdef USE_AS_MEMCMPEQ
> - jnz L(neq_early)
> + ret
> +
> + /* Fits in aligning bytes. */
> +L(cmp_0_0):
> + xorl %eax, %eax
> + ret
> +
> +#ifdef USE_AS_WMEMCMP
> + .p2align 4
> +L(ret_nonzero_vec_start_0):
> + bsfl %eax, %eax
> + movl (%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> + ret
> +#else
> +
> +# ifndef USE_AS_MEMCMPEQ
> + .p2align 4,, 14
> +L(ret_nonzero):
> + /* Need to bswap to get proper return without branch. */
> + bswapq %rcx
> + bswapq %rax
> + subq %rcx, %rax
> + sbbl %eax, %eax
> + orl $1, %eax
> + ret
> +# endif
> +
> + .p2align 4
> +L(cmp_0_3):
> +# ifdef USE_AS_MEMCMPEQ
> + /* No reason to add to the dependency chain on rdx. Saving the
> + bytes here doesn't change the number of fetch blocks. */
> + cmpl $1, %edx
> + jbe L(cmp_0_1)
> +# else
> + /* We need the code size to prevent taking an extra fetch block.
> + */
> + decl %edx
> + jle L(cmp_0_1)
> +# endif
> + movzwl (%rsi), %ecx
> + movzwl (%rdi), %eax
> +
> +# ifdef USE_AS_MEMCMPEQ
> + subl %ecx, %eax
> +
> + movzbl -1(%rsi, %rdx), %esi
> + movzbl -1(%rdi, %rdx), %edi
> + subl %edi, %esi
> + orl %esi, %eax
> +# else
> + bswapl %ecx
> + bswapl %eax
> +
> + /* Implicit right shift by one. We just need to displace the
> + sign bits. */
> + shrl %ecx
> + shrl %eax
> +
> + /* Eat a partial register stall here. Saves code stopping
> + L(cmp_0_3) from bleeding into the next fetch block and saves
> + an ALU. */
> + movb (%rsi, %rdx), %cl
> + movzbl (%rdi, %rdx), %edi
> + orl %edi, %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +#endif
> +
> + .p2align 5
> +L(more_1x_vec):
> +#ifndef USE_AS_WMEMCMP
> + /* Use 0xffff to test for mismatches on pmovmskb bitmask. Store
> + in ecx for code size. This is preferable to using `incw` as
> + it avoids partial register stalls on older hardware (pre
> + SnB). */
> + movl $0xffff, %ecx
> +#endif
> + movups (%rsi), %xmm0
> + movups (%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + subl %ecx, %eax
> + jnz L(ret_nonzero_vec_start_0)
> +#if SIZE_OFFSET == 0
> + cmpq $(CHAR_PER_VEC * 2), %rdx
> #else
> - jnz L(fin2_7)
> + /* Offset rdx. Saves just enough code size to keep the
> + L(last_2x_vec) case and the non-zero return in a single
> + cache line. */
> + subq $(CHAR_PER_VEC * 2), %rdx
> #endif
> -L(s8b):
> - testq $8, %r10
> - jz L(s16b)
> - movq (%rdi), %rax
> - movq (%rdi, %rsi), %rdx
> - subq $8, %r10
> -#ifdef USE_AS_MEMCMPEQ
> - je L(sub_return8)
> + ja L(more_2x_vec)
> +
> + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
> + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + subl %ecx, %eax
> +#ifndef USE_AS_MEMCMPEQ
> + /* Don't use `incw ax` as machines this code runs on are liable
> + to have partial register stalls. */
> + jnz L(ret_nonzero_vec_end_0)
> #else
> - je L(fin2_7)
> + /* Various return targets for memcmpeq. Will always be hot in
> + Icache and get short encoding. */
> +L(ret_nonzero_vec_start_1):
> +L(ret_nonzero_vec_start_0):
> +L(ret_nonzero_vec_end_0):
> #endif
> - addq $8, %rdi
> - cmpq %rdx, %rax
> -#ifdef USE_AS_MEMCMPEQ
> - jnz L(neq_early)
> + ret
> +
> +#ifndef USE_AS_MEMCMPEQ
> +# ifdef USE_AS_WMEMCMP
> + .p2align 4
> +L(ret_nonzero_vec_end_0_adj):
> + addl $3, %edx
> +# else
> + .p2align 4,, 8
> +# endif
> +L(ret_nonzero_vec_end_0):
> + bsfl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + leal (%rax, %rdx, CHAR_SIZE), %eax
> + movl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + addl %edx, %eax
> + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +# ifndef USE_AS_WMEMCMP
> + .p2align 4,, 10
> +L(ret_nonzero_vec_start_0):
> + bsfl %eax, %eax
> + movzbl (%rsi, %rax), %ecx
> + movzbl (%rdi, %rax), %eax
> + subl %ecx, %eax
> + ret
> +# endif
> #else
> - jnz L(fin2_7)
> #endif
> -L(s16b):
> - movdqu (%rdi), %xmm1
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb %xmm0, %xmm1
> +
> + .p2align 5
> +L(more_2x_vec):
> + movups (VEC_SIZE * 1)(%rsi), %xmm0
> + movups (VEC_SIZE * 1)(%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + pmovmskb %xmm1, %eax
> + subl %ecx, %eax
> + jnz L(ret_nonzero_vec_start_1)
> +
> + cmpq $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %rdx
> + jbe L(last_2x_vec)
> +
> + cmpq $(CHAR_PER_VEC * 8 - SIZE_OFFSET), %rdx
> + ja L(more_8x_vec)
> +
> + /* Do comparisons for [65, 96] and [97, 128] 2x VEC at a time.
> + This can harm performance if a non-zero return lands in [65, 80] or
> + [97, 112] but helps performance otherwise. Generally zero-
> + return is hotter. */
> + movups (VEC_SIZE * 2)(%rsi), %xmm0
> + movups (VEC_SIZE * 2)(%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + movups (VEC_SIZE * 3)(%rsi), %xmm2
> + movups (VEC_SIZE * 3)(%rdi), %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pand %xmm1, %xmm3
> +
> + pmovmskb %xmm3, %eax
> + CHECK_CMP (%ecx, %eax)
> + jnz L(ret_nonzero_vec_start_2_3)
> +
> + cmpl $(CHAR_PER_VEC * 6 - SIZE_OFFSET), %edx
> + jbe L(last_2x_vec)
> +
> + movups (VEC_SIZE * 4)(%rsi), %xmm0
> + movups (VEC_SIZE * 4)(%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + movups (VEC_SIZE * 5)(%rsi), %xmm2
> + movups (VEC_SIZE * 5)(%rdi), %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pand %xmm1, %xmm3
> +
> + pmovmskb %xmm3, %eax
> + CHECK_CMP (%ecx, %eax)
> #ifdef USE_AS_MEMCMPEQ
> - pmovmskb %xmm1, %eax
> - subl $0xffff, %eax
> + jz L(last_2x_vec)
> ret
> #else
> - pmovmskb %xmm1, %edx
> - xorl %eax, %eax
> - subl $0xffff, %edx
> - jz L(finz)
> - bsfl %edx, %ecx
> - leaq (%rdi, %rcx), %rcx
> - movzbl (%rcx), %eax
> - movzbl (%rsi, %rcx), %edx
> - jmp L(finz1)
> + jnz L(ret_nonzero_vec_start_4_5)
> #endif
> - .p2align 4,, 4
> -L(finr1b):
> - movzbl (%rdi), %eax
> - movzbl (%rsi), %edx
> -L(finz1):
> - subl %edx, %eax
> -L(exit):
> - ret
> + .p2align 4
> +L(last_2x_vec):
> + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm0
> + movups (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rsi, %rdx, CHAR_SIZE), %xmm2
> + movups (VEC_SIZE * -1 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pand %xmm1, %xmm3
> + pmovmskb %xmm3, %eax
> + subl %ecx, %eax
> #ifdef USE_AS_MEMCMPEQ
> - .p2align 4,, 4
> -L(sub_return8):
> - subq %rdx, %rax
> - movl %eax, %edx
> - shrq $32, %rax
> - orl %edx, %eax
> + /* Various return targets for memcmpeq. Will always be hot in
> + Icache and get short encoding. */
> +L(ret_nonzero_vec_start_2_3):
> +L(ret_nonzero_vec_start_4_5):
> ret
> #else
> - .p2align 4,, 4
> -L(fin2_7):
> - cmpq %rdx, %rax
> - jz L(finz)
> - movq %rax, %r11
> - subq %rdx, %r11
> - bsfq %r11, %rcx
> - sarq $3, %rcx
> - salq $3, %rcx
> - sarq %cl, %rax
> - movzbl %al, %eax
> - sarq %cl, %rdx
> - movzbl %dl, %edx
> - subl %edx, %eax
> + jnz L(ret_nonzero_vec_end_1)
> ret
> -#endif
> - .p2align 4,, 4
> -L(finz):
> - xorl %eax, %eax
> +
> + .p2align 4,, 8
> +L(ret_nonzero_vec_end_1):
> + pmovmskb %xmm1, %ecx
> + /* High 16 bits of eax guaranteed to be all ones. Rotate them in
> + so we can do `or + not` with just `xor`. */
> + rorl $16, %eax
> + xorl %ecx, %eax
> + /* Partial register stall. */
> +
> + bsfl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + leal (%rax, %rdx, CHAR_SIZE), %eax
> + movl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + addl %edx, %eax
> + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * -2 + SIZE_OFFSET)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> ret
> -#ifdef USE_AS_MEMCMPEQ
> - .p2align 4,, 4
> -L(neq_early):
> - movl $1, %eax
> +
> + .p2align 4
> +L(ret_nonzero_vec_start_4_5):
> + pmovmskb %xmm1, %edx
> + sall $16, %eax
> + leal 1(%rax, %rdx), %eax
> + bsfl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + movl (VEC_SIZE * 4)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * 4)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + movzbl (VEC_SIZE * 4)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * 4)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +
> + .p2align 4,, 8
> +L(ret_nonzero_vec_start_1):
> + bsfl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + movl (VEC_SIZE * 1)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * 1)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + movzbl (VEC_SIZE * 1)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * 1)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> ret
> #endif
> - /* For blocks bigger than 32 bytes
> - 1. Advance one of the addr pointer to be 16B aligned.
> - 2. Treat the case of both addr pointers aligned to 16B
> - separately to avoid movdqu.
> - 3. Handle any blocks of greater than 64 consecutive bytes with
> - unrolling to reduce branches.
> - 4. At least one addr pointer is 16B aligned, use memory version
> - of pcmbeqb.
> - */
> - .p2align 4,, 4
> -L(gt32):
> - movq %rdx, %r11
> - addq %rdi, %r11
> - movq %rdi, %r8
> -
> - andq $15, %r8
> - jz L(16am)
> - /* Both pointers may be misaligned. */
> - movdqu (%rdi), %xmm1
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb %xmm0, %xmm1
> - pmovmskb %xmm1, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - neg %r8
> - leaq 16(%rdi, %r8), %rdi
> -L(16am):
> - /* Handle two 16B aligned pointers separately. */
> - testq $15, %rsi
> - jz L(ATR)
> - testq $16, %rdi
> - jz L(A32)
> - movdqu (%rdi, %rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -L(A32):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jae L(mt16)
> - /* Pre-unroll to be ready for unrolled 64B loop. */
> - testq $32, %rdi
> - jz L(A64)
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> -L(A64):
> - movq %r11, %r10
> - andq $-64, %r10
> - cmpq %r10, %rdi
> - jae L(mt32)
> -
> -L(A64main):
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %rdi, %r10
> - jne L(A64main)
> -
> -L(mt32):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jae L(mt16)
> -
> -L(A32main):
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqu (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %rdi, %r10
> - jne L(A32main)
> -L(mt16):
> - subq %rdi, %r11
> - je L(finz)
> - movq %r11, %r10
> - jmp L(small)
> -
> - .p2align 4,, 4
> -L(neq):
> -#ifdef USE_AS_MEMCMPEQ
> - movl $1, %eax
> - ret
> -#else
> - bsfl %edx, %ecx
> - movzbl (%rdi, %rcx), %eax
> - addq %rdi, %rsi
> - movzbl (%rsi,%rcx), %edx
> - jmp L(finz1)
> +
> + .p2align 4
> +L(more_8x_vec):
> + subq %rdi, %rsi
> + leaq (VEC_SIZE * -6 + SIZE_OFFSET)(%rdi, %rdx, CHAR_SIZE), %rdx
> + andq $(VEC_SIZE * -1), %rdi
> + addq %rdi, %rsi
> + .p2align 4
> +L(loop_4x):
> + movups (VEC_SIZE * 2)(%rsi), %xmm0
> + movups (VEC_SIZE * 3)(%rsi), %xmm1
> +
> + PCMPEQ (VEC_SIZE * 2)(%rdi), %xmm0
> + PCMPEQ (VEC_SIZE * 3)(%rdi), %xmm1
> +
> + movups (VEC_SIZE * 4)(%rsi), %xmm2
> + movups (VEC_SIZE * 5)(%rsi), %xmm3
> +
> + PCMPEQ (VEC_SIZE * 4)(%rdi), %xmm2
> + PCMPEQ (VEC_SIZE * 5)(%rdi), %xmm3
> +
> + pand %xmm0, %xmm1
> + pand %xmm2, %xmm3
> + pand %xmm1, %xmm3
> +
> + pmovmskb %xmm3, %eax
> + subl %ecx, %eax
> + jnz L(ret_nonzero_loop)
> +
> + addq $(VEC_SIZE * 4), %rdi
> + addq $(VEC_SIZE * 4), %rsi
> + cmpq %rdi, %rdx
> + ja L(loop_4x)
> + /* Get remaining length in edx. */
> + subl %edi, %edx
> + /* Restore offset so we can reuse L(last_2x_vec). */
> + addl $(VEC_SIZE * 6 - SIZE_OFFSET), %edx
> +#ifdef USE_AS_WMEMCMP
> + shrl $2, %edx
> #endif
> + cmpl $(CHAR_PER_VEC * 4 - SIZE_OFFSET), %edx
> + jbe L(last_2x_vec)
> +
>
> - .p2align 4,, 4
> -L(ATR):
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jae L(mt16)
> - testq $16, %rdi
> - jz L(ATR32)
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> - cmpq %rdi, %r10
> - je L(mt16)
> -
> -L(ATR32):
> - movq %r11, %r10
> - andq $-64, %r10
> - testq $32, %rdi
> - jz L(ATR64)
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> -L(ATR64):
> - cmpq %rdi, %r10
> - je L(mt32)
> -
> -L(ATR64main):
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> - cmpq %rdi, %r10
> - jne L(ATR64main)
> -
> - movq %r11, %r10
> - andq $-32, %r10
> - cmpq %r10, %rdi
> - jae L(mt16)
> -
> -L(ATR32res):
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - movdqa (%rdi,%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> - pmovmskb %xmm0, %edx
> - subl $0xffff, %edx
> - jnz L(neq)
> - addq $16, %rdi
> -
> - cmpq %r10, %rdi
> - jne L(ATR32res)
> -
> - subq %rdi, %r11
> - je L(finz)
> - movq %r11, %r10
> - jmp L(small)
> - /* Align to 16byte to improve instruction fetch. */
> - .p2align 4,, 4
> -END(memcmp)
> + movups (VEC_SIZE * 2)(%rsi), %xmm0
> + movups (VEC_SIZE * 2)(%rdi), %xmm1
> + PCMPEQ %xmm0, %xmm1
> + movups (VEC_SIZE * 3)(%rsi), %xmm2
> + movups (VEC_SIZE * 3)(%rdi), %xmm3
> + PCMPEQ %xmm2, %xmm3
> + pand %xmm1, %xmm3
>
> + pmovmskb %xmm3, %eax
> + CHECK_CMP (%ecx, %eax)
> + jz L(last_2x_vec)
> #ifdef USE_AS_MEMCMPEQ
> -libc_hidden_def (memcmp)
> +L(ret_nonzero_loop):
> + ret
> #else
> -# undef bcmp
> -weak_alias (memcmp, bcmp)
> -libc_hidden_builtin_def (memcmp)
> +
> + .p2align 4
> +L(ret_nonzero_vec_start_2_3):
> + pmovmskb %xmm1, %edx
> + sall $16, %eax
> + leal 1(%rax, %rdx), %eax
> +
> + bsfl %eax, %eax
> +# ifdef USE_AS_WMEMCMP
> + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +
> + .p2align 4
> +L(ret_nonzero_loop):
> + pmovmskb %xmm0, %ecx
> + pmovmskb %xmm1, %edx
> + sall $(VEC_SIZE * 1), %edx
> + leal 1(%rcx, %rdx), %edx
> + pmovmskb %xmm2, %ecx
> + /* High 16 bits of eax guaranteed to be all ones. Rotate them in
> + so we can do `or + not` with just `xor`. */
> + rorl $16, %eax
> + xorl %ecx, %eax
> +
> + salq $32, %rax
> + orq %rdx, %rax
> +
> + bsfq %rax, %rax
> +# ifdef USE_AS_WMEMCMP
> + movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
> + xorl %edx, %edx
> + cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
> + /* NB: no partial register stall here because xorl zero idiom
> + above. */
> + setg %dl
> + leal -1(%rdx, %rdx), %eax
> +# else
> + movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
> + movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
> + subl %ecx, %eax
> +# endif
> + ret
> +#endif
> +END(MEMCMP)
> +
> +#ifndef USE_AS_WMEMCMP
> +# ifdef USE_AS_MEMCMPEQ
> +libc_hidden_def (MEMCMP)
> +# else
> +# undef bcmp
> +weak_alias (MEMCMP, bcmp)
> +libc_hidden_builtin_def (MEMCMP)
> +# endif
> #endif
> diff --git a/sysdeps/x86_64/memcmpeq.S b/sysdeps/x86_64/memcmpeq.S
> index 2cee881fed..80c5e912a6 100644
> --- a/sysdeps/x86_64/memcmpeq.S
> +++ b/sysdeps/x86_64/memcmpeq.S
> @@ -16,6 +16,6 @@
> License along with the GNU C Library; if not, see
> <https://www.gnu.org/licenses/>. */
>
> -#define memcmp __memcmpeq
> +#define MEMCMP __memcmpeq
> #define USE_AS_MEMCMPEQ 1
> #include "multiarch/memcmp-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index e7ea963fc0..b573966966 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -162,8 +162,8 @@ sysdep_routines += \
> wmemchr-sse2 \
> wmemcmp-avx2-movbe \
> wmemcmp-avx2-movbe-rtm \
> - wmemcmp-c \
> wmemcmp-evex-movbe \
> + wmemcmp-sse2 \
> wmemcmp-sse4 \
> # sysdep_routines
> endif
> diff --git a/sysdeps/x86_64/multiarch/memcmp-sse2.S b/sysdeps/x86_64/multiarch/memcmp-sse2.S
> index e10555638d..4080fc1875 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-sse2.S
> @@ -17,8 +17,8 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# ifndef memcmp
> -# define memcmp __memcmp_sse2
> +# ifndef MEMCMP
> +# define MEMCMP __memcmp_sse2
> # endif
>
> # ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
> index de7f5a7525..9d991e5c74 100644
> --- a/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
> +++ b/sysdeps/x86_64/multiarch/memcmpeq-sse2.S
> @@ -17,9 +17,9 @@
> <https://www.gnu.org/licenses/>. */
>
> #if IS_IN (libc)
> -# define memcmp __memcmpeq_sse2
> +# define MEMCMP __memcmpeq_sse2
> #else
> -# define memcmp __memcmpeq
> +# define MEMCMP __memcmpeq
> #endif
> #define USE_AS_MEMCMPEQ 1
> #include "memcmp-sse2.S"
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-c.c b/sysdeps/x86_64/multiarch/wmemcmp-c.c
> deleted file mode 100644
> index 46b6715e18..0000000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-c.c
> +++ /dev/null
> @@ -1,9 +0,0 @@
> -#if IS_IN (libc)
> -# include <wchar.h>
> -
> -# define WMEMCMP __wmemcmp_sse2
> -
> -extern __typeof (wmemcmp) __wmemcmp_sse2;
> -#endif
> -
> -#include "wcsmbs/wmemcmp.c"
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-sse2.S b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
> new file mode 100644
> index 0000000000..57be1c446e
> --- /dev/null
> +++ b/sysdeps/x86_64/multiarch/wmemcmp-sse2.S
> @@ -0,0 +1,25 @@
> +/* wmemcmp optimized with SSE2.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#if IS_IN (libc)
> +# define MEMCMP __wmemcmp_sse2
> +#else
> +# define MEMCMP wmemcmp
> +#endif
> +#define USE_AS_WMEMCMP 1
> +#include "memcmp-sse2.S"
> diff --git a/sysdeps/x86_64/wmemcmp.S b/sysdeps/x86_64/wmemcmp.S
> new file mode 100644
> index 0000000000..032f389158
> --- /dev/null
> +++ b/sysdeps/x86_64/wmemcmp.S
> @@ -0,0 +1,21 @@
> +/* wmemcmp optimized with SSE2.
> + Copyright (C) 2022 Free Software Foundation, Inc.
> + This file is part of the GNU C Library.
> +
> + The GNU C Library is free software; you can redistribute it and/or
> + modify it under the terms of the GNU Lesser General Public
> + License as published by the Free Software Foundation; either
> + version 2.1 of the License, or (at your option) any later version.
> +
> + The GNU C Library is distributed in the hope that it will be useful,
> + but WITHOUT ANY WARRANTY; without even the implied warranty of
> + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> + Lesser General Public License for more details.
> +
> + You should have received a copy of the GNU Lesser General Public
> + License along with the GNU C Library; if not, see
> + <https://www.gnu.org/licenses/>. */
> +
> +#define MEMCMP wmemcmp
> +#define USE_AS_WMEMCMP 1
> +#include "multiarch/memcmp-sse2.S"
> --
> 2.25.1
>
^ permalink raw reply [flat|nested] 49+ messages in thread
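
Two branchless idioms in the quoted memcmp rewrite deserve a restatement. Mismatch detection: `pcmpeqb` plus `pmovmskb` yields 0xffff exactly when all 16 bytes match, and subtracting 0xffff (rather than `incw`) sets ZF without a partial-register write. Return computation: the 8-byte L(ret_nonzero) path byte-swaps both words so the first differing byte becomes the most significant, after which a single unsigned comparison gives memcmp's ordering. A hedged C sketch, with illustrative names (not glibc source; __builtin_bswap64 is the GCC/Clang builtin):

#include <immintrin.h>
#include <stdint.h>
#include <string.h>

/* 16-byte mismatch test: returns 0 iff the blocks are equal.  */
static int
mismatch_16 (const void *s1, const void *s2)
{
  __m128i a = _mm_loadu_si128 ((const __m128i *) s1);
  __m128i b = _mm_loadu_si128 ((const __m128i *) s2);
  /* movemask of pcmpeqb is 0xffff on full equality; the subtraction
     mirrors `subl %ecx, %eax` against the preloaded 0xffff.  */
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (a, b)) - 0xffff;
}

/* L(ret_nonzero) for the 8-byte paths: bswap makes the first
   differing byte most significant, so unsigned comparison of the
   swapped words orders the blocks the way memcmp must.  */
static int
ret_nonzero_8 (const void *s1, const void *s2)
{
  uint64_t a, b;
  memcpy (&a, s1, 8);
  memcpy (&b, s2, 8);
  a = __builtin_bswap64 (a);
  b = __builtin_bswap64 (b);
  return (a > b) - (a < b);   /* -1, 0, or 1 */
}

The final expression plays the role of the patch's `subq; sbbl %eax, %eax; orl $1, %eax` sequence, which likewise produces -1 or 1 without a branch (that path is only reached when the words differ).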
* Re: [PATCH v3 5/6] x86: Remove memcmp-sse4.S
2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:46 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Code didn't actually use any sse4 instructions. The new memcmp-sse2
> implementation is also faster.
>
> geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
>
> Note there are two regressions preferring SSE2 for Size = 1 and Size =
> 65.
>
> Size = 1:
> size, align0, align1, ret, New Time/Old Time
> 1, 1, 1, 0, 1.2
> 1, 1, 1, 1, 1.197
> 1, 1, 1, -1, 1.2
>
> This is intentional. Size == 1 is significantly less hot based on
> profiles of GCC11 and Python3 than sizes [4, 8] (which are made
> hotter).
>
> Python3 Size = 1 -> 13.64%
> Python3 Size = [4, 8] -> 60.92%
>
> GCC11 Size = 1 -> 1.29%
> GCC11 Size = [4, 8] -> 33.86%
>
> size, align0, align1, ret, New Time/Old Time
> 4, 4, 4, 0, 0.622
> 4, 4, 4, 1, 0.797
> 4, 4, 4, -1, 0.805
> 5, 5, 5, 0, 0.623
> 5, 5, 5, 1, 0.777
> 5, 5, 5, -1, 0.802
> 6, 6, 6, 0, 0.625
> 6, 6, 6, 1, 0.813
> 6, 6, 6, -1, 0.788
> 7, 7, 7, 0, 0.625
> 7, 7, 7, 1, 0.799
> 7, 7, 7, -1, 0.795
> 8, 8, 8, 0, 0.625
> 8, 8, 8, 1, 0.848
> 8, 8, 8, -1, 0.914
> 9, 9, 9, 0, 0.625
>
> Size = 65:
> size, align0, align1, ret, New Time/Old Time
> 65, 0, 0, 0, 1.103
> 65, 0, 0, 1, 1.216
> 65, 0, 0, -1, 1.227
> 65, 65, 0, 0, 1.091
> 65, 0, 65, 1, 1.19
> 65, 65, 65, -1, 1.215
>
> This is because A) the checks in range [65, 96] are now unrolled 2x
> and B) smaller values <= 16 are now given a hotter path. By
> contrast the SSE4 version has a branch for Size = 80. The unrolled
> version gets better performance for returns which need both
> comparisons.
>
> size, align0, align1, ret, New Time/Old Time
> 128, 4, 8, 0, 0.858
> 128, 4, 8, 1, 0.879
> 128, 4, 8, -1, 0.888
>
> As well, outside of microbenchmark environments, which are fully
> predictable, the branch will have a real cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 --
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ----
> sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 ----
> 3 files changed, 10 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index b573966966..0400ea332b 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -11,7 +11,6 @@ sysdep_routines += \
> memcmp-avx2-movbe-rtm \
> memcmp-evex-movbe \
> memcmp-sse2 \
> - memcmp-sse4 \
> memcmpeq-avx2 \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> @@ -164,7 +163,6 @@ sysdep_routines += \
> wmemcmp-avx2-movbe-rtm \
> wmemcmp-evex-movbe \
> wmemcmp-sse2 \
> - wmemcmp-sse4 \
> # sysdep_routines
> endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index c6008a73ed..a8afcf81bb 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -96,8 +96,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (MOVBE)),
> __memcmp_evex_movbe)
> - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
> - __memcmp_sse4_1)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> #ifdef SHARED
> @@ -809,8 +807,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> && CPU_FEATURE_USABLE (BMI2)
> && CPU_FEATURE_USABLE (MOVBE)),
> __wmemcmp_evex_movbe)
> - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
> - __wmemcmp_sse4_1)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/wmemset.c. */
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> index 44759a3ad5..c743970fe3 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> @@ -20,7 +20,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
> @@ -46,8 +45,5 @@ IFUNC_SELECTOR (void)
> return OPTIMIZE (avx2_movbe);
> }
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> - return OPTIMIZE (sse4_1);
> -
> return OPTIMIZE (sse2);
> }
> --
> 2.25.1
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* Re: [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S
2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
@ 2022-04-10 0:48 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:48 UTC (permalink / raw)
To: GNU C Library
Disregard this patch. It's from the wrong patchset.
On Sat, Apr 9, 2022 at 7:47 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> Old code was both inefficient and wasted code size. New code saves 62
> bytes and has comparable or better performance in the page cross case.
>
> geometric_mean(N=20) of page cross cases New / Original: 0.960
>
> size, align0, align1, ret, New Time/Old Time
> 1, 4095, 0, 0, 1.001
> 1, 4095, 0, 1, 0.999
> 1, 4095, 0, -1, 1.0
> 2, 4094, 0, 0, 1.0
> 2, 4094, 0, 1, 1.0
> 2, 4094, 0, -1, 1.0
> 3, 4093, 0, 0, 1.0
> 3, 4093, 0, 1, 1.0
> 3, 4093, 0, -1, 1.0
> 4, 4092, 0, 0, 0.987
> 4, 4092, 0, 1, 1.0
> 4, 4092, 0, -1, 1.0
> 5, 4091, 0, 0, 0.984
> 5, 4091, 0, 1, 1.002
> 5, 4091, 0, -1, 1.005
> 6, 4090, 0, 0, 0.993
> 6, 4090, 0, 1, 1.001
> 6, 4090, 0, -1, 1.003
> 7, 4089, 0, 0, 0.991
> 7, 4089, 0, 1, 1.0
> 7, 4089, 0, -1, 1.001
> 8, 4088, 0, 0, 0.875
> 8, 4088, 0, 1, 0.881
> 8, 4088, 0, -1, 0.888
> 9, 4087, 0, 0, 0.872
> 9, 4087, 0, 1, 0.879
> 9, 4087, 0, -1, 0.883
> 10, 4086, 0, 0, 0.878
> 10, 4086, 0, 1, 0.886
> 10, 4086, 0, -1, 0.873
> 11, 4085, 0, 0, 0.878
> 11, 4085, 0, 1, 0.881
> 11, 4085, 0, -1, 0.879
> 12, 4084, 0, 0, 0.873
> 12, 4084, 0, 1, 0.889
> 12, 4084, 0, -1, 0.875
> 13, 4083, 0, 0, 0.873
> 13, 4083, 0, 1, 0.863
> 13, 4083, 0, -1, 0.863
> 14, 4082, 0, 0, 0.838
> 14, 4082, 0, 1, 0.869
> 14, 4082, 0, -1, 0.877
> 15, 4081, 0, 0, 0.841
> 15, 4081, 0, 1, 0.869
> 15, 4081, 0, -1, 0.876
> 16, 4080, 0, 0, 0.988
> 16, 4080, 0, 1, 0.99
> 16, 4080, 0, -1, 0.989
> 17, 4079, 0, 0, 0.978
> 17, 4079, 0, 1, 0.981
> 17, 4079, 0, -1, 0.98
> 18, 4078, 0, 0, 0.981
> 18, 4078, 0, 1, 0.98
> 18, 4078, 0, -1, 0.985
> 19, 4077, 0, 0, 0.977
> 19, 4077, 0, 1, 0.979
> 19, 4077, 0, -1, 0.986
> 20, 4076, 0, 0, 0.977
> 20, 4076, 0, 1, 0.986
> 20, 4076, 0, -1, 0.984
> 21, 4075, 0, 0, 0.977
> 21, 4075, 0, 1, 0.983
> 21, 4075, 0, -1, 0.988
> 22, 4074, 0, 0, 0.983
> 22, 4074, 0, 1, 0.994
> 22, 4074, 0, -1, 0.993
> 23, 4073, 0, 0, 0.98
> 23, 4073, 0, 1, 0.992
> 23, 4073, 0, -1, 0.995
> 24, 4072, 0, 0, 0.989
> 24, 4072, 0, 1, 0.989
> 24, 4072, 0, -1, 0.991
> 25, 4071, 0, 0, 0.99
> 25, 4071, 0, 1, 0.999
> 25, 4071, 0, -1, 0.996
> 26, 4070, 0, 0, 0.993
> 26, 4070, 0, 1, 0.995
> 26, 4070, 0, -1, 0.998
> 27, 4069, 0, 0, 0.993
> 27, 4069, 0, 1, 0.999
> 27, 4069, 0, -1, 1.0
> 28, 4068, 0, 0, 0.997
> 28, 4068, 0, 1, 1.0
> 28, 4068, 0, -1, 0.999
> 29, 4067, 0, 0, 0.996
> 29, 4067, 0, 1, 0.999
> 29, 4067, 0, -1, 0.999
> 30, 4066, 0, 0, 0.991
> 30, 4066, 0, 1, 1.001
> 30, 4066, 0, -1, 0.999
> 31, 4065, 0, 0, 0.988
> 31, 4065, 0, 1, 0.998
> 31, 4065, 0, -1, 0.998
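>
> For illustration, a rough C sketch of the movbe trick used below for
> sizes in [4, 7] (helper names are made up): two overlapping
> big-endian dwords are packed into one qword, so a single 64-bit
> subtract covers all n bytes and the only branch left is the nonzero
> fast path. The [2, 3] case is the same idea at word size, with the
> bswapped halves shifted right by one so the top bit stays clear and
> a plain 32-bit subtract gives the sign directly.
>
> #include <stdint.h>
> #include <string.h>
>
> /* Big-endian 4-byte load, the scalar equivalent of movbe (assumes a
>    little-endian host).  */
> static uint32_t
> load_be32 (const unsigned char *p)
> {
>   uint32_t v;
>   memcpy (&v, p, sizeof v);
>   return __builtin_bswap32 (v);
> }
>
> static int
> cmp_4_to_7 (const unsigned char *a, const unsigned char *b, size_t n)
> {
>   uint64_t x = ((uint64_t) load_be32 (a) << 32) | load_be32 (a + n - 4);
>   uint64_t y = ((uint64_t) load_be32 (b) << 32) | load_be32 (b + n - 4);
>   if (x == y)
>     return 0;
>   return x > y ? 1 : -1;
> }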
> ---
> sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
> 1 file changed, 61 insertions(+), 37 deletions(-)
>
> diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> index a34ea1645d..210c9925b6 100644
> --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
> @@ -429,22 +429,21 @@ L(page_cross_less_vec):
> # ifndef USE_AS_WMEMCMP
> cmpl $8, %edx
> jae L(between_8_15)
> + /* Fall through for [4, 7]. */
> cmpl $4, %edx
> - jae L(between_4_7)
> + jb L(between_2_3)
>
> - /* Load as big endian to avoid branches. */
> - movzwl (%rdi), %eax
> - movzwl (%rsi), %ecx
> - shll $8, %eax
> - shll $8, %ecx
> - bswap %eax
> - bswap %ecx
> - movzbl -1(%rdi, %rdx), %edi
> - movzbl -1(%rsi, %rdx), %esi
> - orl %edi, %eax
> - orl %esi, %ecx
> - /* Subtraction is okay because the upper 8 bits are zero. */
> - subl %ecx, %eax
> + movbe (%rdi), %eax
> + movbe (%rsi), %ecx
> + shlq $32, %rax
> + shlq $32, %rcx
> + movbe -4(%rdi, %rdx), %edi
> + movbe -4(%rsi, %rdx), %esi
> + orq %rdi, %rax
> + orq %rsi, %rcx
> + subq %rcx, %rax
> + /* Fast path for return zero. */
> + jnz L(ret_nonzero)
> /* No ymm register was touched. */
> ret
>
> @@ -457,9 +456,33 @@ L(one_or_less):
> /* No ymm register was touched. */
> ret
>
> + .p2align 4,, 5
> +L(ret_nonzero):
> + sbbl %eax, %eax
> + orl $1, %eax
> + /* No ymm register was touched. */
> + ret
> +
> + .p2align 4,, 2
> +L(zero):
> + xorl %eax, %eax
> + /* No ymm register was touched. */
> + ret
> +
> .p2align 4
> L(between_8_15):
> -# endif
> + movbe (%rdi), %rax
> + movbe (%rsi), %rcx
> + subq %rcx, %rax
> + jnz L(ret_nonzero)
> + movbe -8(%rdi, %rdx), %rax
> + movbe -8(%rsi, %rdx), %rcx
> + subq %rcx, %rax
> + /* Fast path for return zero. */
> + jnz L(ret_nonzero)
> + /* No ymm register was touched. */
> + ret
> +# else
> /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
> vmovq (%rdi), %xmm1
> vmovq (%rsi), %xmm2
> @@ -475,16 +498,13 @@ L(between_8_15):
> VPCMPEQ %xmm1, %xmm2, %xmm2
> vpmovmskb %xmm2, %eax
> subl $0xffff, %eax
> + /* Fast path for return zero. */
> jnz L(return_vec_0)
> /* No ymm register was touched. */
> ret
> +# endif
>
> - .p2align 4
> -L(zero):
> - xorl %eax, %eax
> - ret
> -
> - .p2align 4
> + .p2align 4,, 10
> L(between_16_31):
> /* From 16 to 31 bytes. No branch when size == 16. */
> vmovdqu (%rsi), %xmm2
> @@ -501,11 +521,17 @@ L(between_16_31):
> VPCMPEQ (%rdi), %xmm2, %xmm2
> vpmovmskb %xmm2, %eax
> subl $0xffff, %eax
> + /* Fast path for return zero. */
> jnz L(return_vec_0)
> /* No ymm register was touched. */
> ret
>
> # ifdef USE_AS_WMEMCMP
> + .p2align 4,, 2
> +L(zero):
> + xorl %eax, %eax
> + ret
> +
> .p2align 4
> L(one_or_less):
> jb L(zero)
> @@ -520,22 +546,20 @@ L(one_or_less):
> # else
>
> .p2align 4
> -L(between_4_7):
> - /* Load as big endian with overlapping movbe to avoid branches.
> - */
> - movbe (%rdi), %eax
> - movbe (%rsi), %ecx
> - shlq $32, %rax
> - shlq $32, %rcx
> - movbe -4(%rdi, %rdx), %edi
> - movbe -4(%rsi, %rdx), %esi
> - orq %rdi, %rax
> - orq %rsi, %rcx
> - subq %rcx, %rax
> - jz L(zero_4_7)
> - sbbl %eax, %eax
> - orl $1, %eax
> -L(zero_4_7):
> +L(between_2_3):
> + /* Load as big endian to avoid branches. */
> + movzwl (%rdi), %eax
> + movzwl (%rsi), %ecx
> + bswap %eax
> + bswap %ecx
> + shrl %eax
> + shrl %ecx
> + movzbl -1(%rdi, %rdx), %edi
> + movzbl -1(%rsi, %rdx), %esi
> + orl %edi, %eax
> + orl %esi, %ecx
> + /* Subtraction is okay because the upper bit is zero. */
> + subl %ecx, %eax
> /* No ymm register was touched. */
> ret
> # endif
> --
> 2.25.1
>
^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (7 preceding siblings ...)
2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
@ 2022-04-10 0:54 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (3 more replies)
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
9 siblings, 4 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
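
For illustration, a rough intrinsics sketch of why the removed file is
nearly 2000 lines (function names here are made up): palignr takes its
byte shift as a compile-time immediate, so the old code carried one
copy of the compare loop for every relative misalignment 0..15, the
L(shr_0) .. L(shr_15) blocks in the diff below. One such
specialization looks roughly like:

#include <tmmintrin.h> /* SSSE3 */

/* Compare one aligned 16-byte row of A against a source B whose data
   starts 1 byte past its aligned address.  */
static int
row_matches_shr1 (const __m128i *a, const __m128i *b_aligned)
{
  __m128i lo = _mm_load_si128 (b_aligned);
  __m128i hi = _mm_load_si128 (b_aligned + 1);
  /* Rebuild the unaligned 16 bytes; the immediate must be the
     literal 1, hence a separate loop body per offset.  */
  __m128i bv = _mm_alignr_epi8 (hi, lo, 1);
  __m128i eq = _mm_cmpeq_epi8 (_mm_load_si128 (a), bv);
  return _mm_movemask_epi8 (eq) == 0xffff;
}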
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
5 files changed, 2006 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
memcmp-evex-movbe \
memcmp-sse2 \
memcmp-sse4 \
- memcmp-ssse3 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
wmemcmp-c \
wmemcmp-evex-movbe \
wmemcmp-sse4 \
- wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
- __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
- __wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elements.
-*/
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
- test %RDX_LP, %RDX_LP
- jz L(equal)
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx;
- jae L(48bytesormore) /* LEN => 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* ECX >= 32. */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get there only if we already know there is a
-difference. */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
--
2.25.1
^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-04-10 0:54 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
` (2 subsequent siblings)
3 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
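
The strcmp.S hunks below drop the USE_SSSE3 special case entirely: the
plain SSE2 sequence psrldq/pslldq/por builds the same merged vector
that palignr did, so a single code path suffices. For illustration, a
rough intrinsics sketch of the equivalence for a 1-byte shift (function
names are made up):

#include <tmmintrin.h> /* SSE2 plus the SSSE3 alignr */

/* Bytes 1..15 of PREV followed by byte 0 of NEXT, the SSE2 way.  */
static __m128i
merge_sse2 (__m128i prev, __m128i next)
{
  return _mm_or_si128 (_mm_srli_si128 (prev, 1),
                       _mm_slli_si128 (next, 15));
}

/* The same 16 bytes via SSSE3 palignr.  */
static __m128i
merge_ssse3 (__m128i prev, __m128i next)
{
  return _mm_alignr_epi8 (next, prev, 1);
}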
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
sysdeps/x86_64/multiarch/strcmp.c | 4 -
sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
sysdeps/x86_64/multiarch/strncmp.c | 4 -
sysdeps/x86_64/strcmp.S | 155 ++++--------------
10 files changed, 30 insertions(+), 202 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 51222dfab1..ed2def288d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -58,7 +58,6 @@ sysdep_routines += \
strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
- strcasecmp_l-ssse3 \
strcat-avx2 \
strcat-avx2-rtm \
strcat-evex \
@@ -80,7 +79,6 @@ sysdep_routines += \
strcmp-sse2 \
strcmp-sse2-unaligned \
strcmp-sse4_2 \
- strcmp-ssse3 \
strcpy-avx2 \
strcpy-avx2-rtm \
strcpy-evex \
@@ -98,7 +96,6 @@ sysdep_routines += \
strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
- strncase_l-ssse3 \
strncat-avx2 \
strncat-avx2-rtm \
strncat-c \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncmp-evex \
strncmp-sse2 \
strncmp-sse4_2 \
- strncmp-ssse3 \
strncpy-avx2 \
strncpy-avx2-rtm \
strncpy-c \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f389928a4e..7e2be3554b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
@@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
__strcasecmp_l_sse2))
@@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strcmp_evex)
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
__strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
- __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
@@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
__strncasecmp_sse2))
@@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
__strncasecmp_l_sse2))
@@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncmp_evex)
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
__strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
- __strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 766539c241..296d32071b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -20,7 +20,6 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baad..a248c2a6e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -28,7 +28,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index ec37308347..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcmp optimized with SSSE3.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define STRCMP __strncmp_ssse3
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strcmp)
-
-#define USE_SSSE3 1
-#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index fca74199d8..70ae6547c9 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,7 +27,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 99d8b36f1d..c38dc627f9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -59,12 +59,7 @@
# endif
#endif
-#ifndef USE_SSSE3
.text
-#else
- .section .text.ssse3,"ax",@progbits
-#endif
-
#ifdef USE_AS_STRCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
@@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
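For reference, the removed `palignr' path and the retained psrldq/pslldq/por
sequence compute the same thing: a merge of two 16-byte registers at a byte
offset. A rough intrinsics sketch of the equivalence (illustrative only, not
glibc code; N must be a compile-time immediate):

#include <emmintrin.h>   /* SSE2: psrldq/pslldq/por */
#include <tmmintrin.h>   /* SSSE3: palignr */

/* Bytes N..15 of LO followed by bytes 0..N-1 of HI.  */
#define MERGE_SSE2(lo, hi, N) \
  _mm_or_si128 (_mm_srli_si128 ((lo), (N)), \
                _mm_slli_si128 ((hi), 16 - (N)))
#define MERGE_SSSE3(lo, hi, N) _mm_alignr_epi8 ((hi), (lo), (N))

palignr does the merge in one instruction instead of three, which is what the
USE_SSSE3 conditional bought; with the SSSE3 str{n}{case}cmp variants gone,
strcmp.S only needs the SSE2 form.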
--
2.25.1
* [PATCH v4 3/6] x86: Remove str{n}cat-ssse3
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-04-10 0:54 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
3 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
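Concretely, the ifunc selector below just loses its SSSE3 branch, so CPUs
that used to take it fall through to the SSE2 baseline. A simplified model of
the selection order (the have_* checks and strcat_* targets are hypothetical
stand-ins for glibc's CPU_FEATURE_USABLE_P machinery and __strcat_* symbols):

#include <string.h>

typedef char *(*strcat_fn) (char *, const char *);

/* Hypothetical stubs; real glibc queries the cpu_features block and
   resolves to per-ISA assembly implementations.  */
static int have_avx2 (void) { return 0; }
static int have_fast_unaligned_load (void) { return 1; }
static char *strcat_avx2 (char *d, const char *s) { return strcat (d, s); }
static char *strcat_sse2_unaligned (char *d, const char *s)
{ return strcat (d, s); }
static char *strcat_sse2 (char *d, const char *s) { return strcat (d, s); }

static strcat_fn
select_strcat (void)
{
  if (have_avx2 ())
    return strcat_avx2;
  if (have_fast_unaligned_load ())
    return strcat_sse2_unaligned;
  /* The branch removed by this patch sat here:
       if (have_ssse3 ()) return strcat_ssse3;
     Those CPUs now take the SSE2 baseline, which every x86_64
     CPU provides.  */
  return strcat_sse2;
}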
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
5 files changed, 879 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ed2def288d..2b3c625ea2 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -63,7 +63,6 @@ sysdep_routines += \
strcat-evex \
strcat-sse2 \
strcat-sse2-unaligned \
- strcat-ssse3 \
strchr-avx2 \
strchr-avx2-rtm \
strchr-evex \
@@ -101,7 +100,6 @@ sysdep_routines += \
strncat-c \
strncat-evex \
strncat-sse2-unaligned \
- strncat-ssse3 \
strncmp-avx2 \
strncmp-avx2-rtm \
strncmp-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e2be3554b..41a04621ad 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -481,8 +481,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcat_evex)
- IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
- __strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
@@ -630,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncat_evex)
- IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
- __strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 5bece38f78..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -23,7 +23,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index 9f39e4fcd1..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,866 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
--
--
2.25.1
* [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-04-10 0:54 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
3 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth it to keep the SSSE3
versions given the code size cost.
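Most of the deleted strcpy-ssse3.S below is a 64-byte main loop
(L(Aligned64Loop)) plus fifteen palignr realignment loops, one per possible
source/destination misalignment delta (L(Shl1)..L(Shl15)). The loop's NUL
check leans on pminub: the byte-wise unsigned minimum of four vectors has a
zero lane iff any of the 64 input bytes is zero, so a single pcmpeqb and
pmovmskb cover the whole chunk. A sketch (illustrative only; assumes P is
16-byte aligned):

#include <emmintrin.h>

/* Nonzero iff the 64 bytes at P contain a NUL.  The real loop keeps
   the four loads live in registers so it can re-test them one at a
   time once this combined check fires.  */
static int
has_nul_in_64 (const char *p)
{
  const __m128i *v = (const __m128i *) p;
  __m128i m = _mm_min_epu8 (_mm_min_epu8 (_mm_load_si128 (v + 0),
                                          _mm_load_si128 (v + 1)),
                            _mm_min_epu8 (_mm_load_si128 (v + 2),
                                          _mm_load_si128 (v + 3)));
  return _mm_movemask_epi8 (_mm_cmpeq_epi8 (m, _mm_setzero_si128 ())) != 0;
}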
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
6 files changed, 3572 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2b3c625ea2..5b02ec8de5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -46,13 +46,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -83,7 +81,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 41a04621ad..49ce6860d0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1
* [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (2 preceding siblings ...)
2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
@ 2022-04-10 0:54 ` Noah Goldstein
3 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:54 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given their code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +-
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
5 files changed, 6 insertions(+), 3212 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5b02ec8de5..303fb5d734 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,6 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memcpy-ssse3 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
@@ -25,7 +24,6 @@ sysdep_routines += \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 49ce6860d0..c6008a73ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
@@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
@@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
@@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
@@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
@@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..fb01fbb301 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ return OPTIMIZE (ssse3);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
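Because the interleaved +/- lines above make the final control flow hard to read, here is the tail of the memmove IFUNC selector as it stands after this hunk, paraphrased in plain C (cpu_features, CPU_FEATURE_USABLE_P, CPU_FEATURES_ARCH_P, and OPTIMIZE are the glibc-internal names already used in the hunk; the earlier AVX512/AVX branches are unchanged and elided):

  /* SSSE3 is now the last vector special case: chosen only when
     unaligned copies are not known to be fast on this target;
     otherwise fall through to the SSE2 variants.  */
  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
      && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
    return OPTIMIZE (ssse3);

  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
    return OPTIMIZE (sse2_unaligned_erms);

  return OPTIMIZE (sse2_unaligned);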
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
--
2.25.1
^ permalink raw reply [flat|nested] 49+ messages in thread
* [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
@ 2022-04-10 0:57 ` Noah Goldstein
0 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-10 0:57 UTC (permalink / raw)
To: libc-alpha
The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and
EVEX versions are generally preferable. memcpy/memmove is one
exception, where avoiding unaligned loads with `palignr` is important
for some targets.
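For context, here is a minimal, self-contained sketch (not taken from
the patch; the function name and the misalignment constant 5 are
illustrative assumptions) of how `palignr` turns two aligned loads
into one misaligned 16-byte chunk, which is the trick the SSSE3
memmove relies on:
    #include <tmmintrin.h> /* SSSE3 intrinsics.  */
    /* Copy bytes src_aligned[5..20] to dst using only aligned loads.
       src_aligned must be 16-byte aligned; 5 stands for the source's
       misalignment within its 16-byte block.  */
    static void
    copy16_misaligned5 (unsigned char *dst, const unsigned char *src_aligned)
    {
      __m128i lo = _mm_load_si128 ((const __m128i *) src_aligned);        /* bytes 0..15 */
      __m128i hi = _mm_load_si128 ((const __m128i *) (src_aligned + 16)); /* bytes 16..31 */
      /* Concatenate hi:lo and shift right by 5 bytes -> bytes 5..20.  */
      __m128i chunk = _mm_alignr_epi8 (hi, lo, 5);
      _mm_storeu_si128 ((__m128i *) dst, chunk);
    }
Compile with -mssse3.  In the real shl_N loops the previous
iteration's high half is reused as the next low half, so each
additional 16 bytes costs roughly one aligned load, one palignr, and
one store.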
This commit replaces memmove-ssse3 with a better optimized, lower
code footprint version. It also aliases memcpy to memmove.
Aside from this function, all other SSSE3 functions should be safe to
remove.
Performance is not changed drastically, although it shows overall
improvement without any major regressions or gains.
bench-memcpy geometric_mean(N=50) New / Original: 0.962
bench-memcpy-random geometric_mean(N=50) New / Original: 0.895
bench-memcpy-large geometric_mean(N=50) New / Original: 0.894
Benchmarks were run on a Zhaoxin KX-6840 @ 2000MHz. See the attached
numbers for all results.
More importantly, this saves 7246 bytes of code size in memmove and
an additional 10741 bytes by reusing the memmove code for memcpy
(17987 bytes saved in total). It also saves an additional 896 bytes
of rodata for the jump table entries.
---
memcpy benchmarks comparing memcpy-ssse3 before / after this
patch. Results are the geometric mean of N=50 runs on a Zhaoxin
KX-6840 @ 2000MHz.
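For reference, each ratio below is presumably the geometric mean of
the per-run time ratios (this reading is an assumption; the benchmark
output only reports the final ratio):
    \mathrm{New/Old} = \Bigl(\prod_{i=1}^{50} \frac{t_{\mathrm{new},i}}{t_{\mathrm{old},i}}\Bigr)^{1/50}
so values below 1.0 mean the new implementation is faster for that
(length, align1, align2, dst > src) configuration.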
bench-memcpy:
length, align1, align2, dst > src, New Time / Old Time
1, 0, 0, 0, 2.099
1, 0, 0, 1, 2.099
1, 32, 0, 0, 2.103
1, 32, 0, 1, 2.103
1, 0, 32, 0, 2.099
1, 0, 32, 1, 2.098
1, 32, 32, 0, 2.098
1, 32, 32, 1, 2.098
1, 2048, 0, 0, 2.098
1, 2048, 0, 1, 2.098
2, 0, 0, 0, 1.135
2, 0, 0, 1, 1.136
2, 1, 0, 0, 1.139
2, 1, 0, 1, 1.139
2, 33, 0, 0, 1.165
2, 33, 0, 1, 1.139
2, 0, 1, 0, 1.136
2, 0, 1, 1, 1.136
2, 0, 33, 0, 1.136
2, 0, 33, 1, 1.136
2, 1, 1, 0, 1.136
2, 1, 1, 1, 1.136
2, 33, 33, 0, 1.136
2, 33, 33, 1, 1.136
2, 2048, 0, 0, 1.136
2, 2048, 0, 1, 1.136
2, 2049, 0, 0, 1.191
2, 2049, 0, 1, 1.139
2, 2048, 1, 0, 1.136
2, 2048, 1, 1, 1.136
2, 2049, 1, 0, 1.136
2, 2049, 1, 1, 1.136
4, 0, 0, 0, 1.074
4, 0, 0, 1, 0.962
4, 2, 0, 0, 0.973
4, 2, 0, 1, 0.989
4, 34, 0, 0, 0.991
4, 34, 0, 1, 0.991
4, 0, 2, 0, 0.962
4, 0, 2, 1, 0.962
4, 0, 34, 0, 0.962
4, 0, 34, 1, 0.962
4, 2, 2, 0, 0.962
4, 2, 2, 1, 0.962
4, 34, 34, 0, 0.962
4, 34, 34, 1, 0.962
4, 2048, 0, 0, 0.962
4, 2048, 0, 1, 0.962
4, 2050, 0, 0, 0.977
4, 2050, 0, 1, 0.979
4, 2048, 2, 0, 0.962
4, 2048, 2, 1, 0.962
4, 2050, 2, 0, 0.962
4, 2050, 2, 1, 0.962
8, 0, 0, 0, 0.961
8, 0, 0, 1, 0.962
8, 3, 0, 0, 1.0
8, 3, 0, 1, 1.0
8, 35, 0, 0, 1.0
8, 35, 0, 1, 1.0
8, 0, 3, 0, 0.962
8, 0, 3, 1, 0.962
8, 0, 35, 0, 0.962
8, 0, 35, 1, 0.962
8, 3, 3, 0, 0.962
8, 3, 3, 1, 0.962
8, 35, 35, 0, 0.962
8, 35, 35, 1, 0.962
8, 2048, 0, 0, 0.962
8, 2048, 0, 1, 0.962
8, 2051, 0, 0, 1.0
8, 2051, 0, 1, 1.0
8, 2048, 3, 0, 0.962
8, 2048, 3, 1, 0.962
8, 2051, 3, 0, 0.962
8, 2051, 3, 1, 0.962
16, 0, 0, 0, 0.798
16, 0, 0, 1, 0.799
16, 4, 0, 0, 0.8
16, 4, 0, 1, 0.801
16, 36, 0, 0, 0.801
16, 36, 0, 1, 0.8
16, 0, 4, 0, 0.798
16, 0, 4, 1, 0.798
16, 0, 36, 0, 0.798
16, 0, 36, 1, 0.798
16, 4, 4, 0, 0.798
16, 4, 4, 1, 0.798
16, 36, 36, 0, 0.798
16, 36, 36, 1, 0.798
16, 2048, 0, 0, 0.798
16, 2048, 0, 1, 0.799
16, 2052, 0, 0, 0.8
16, 2052, 0, 1, 0.8
16, 2048, 4, 0, 0.798
16, 2048, 4, 1, 0.798
16, 2052, 4, 0, 0.798
16, 2052, 4, 1, 0.798
32, 0, 0, 0, 0.471
32, 0, 0, 1, 0.471
32, 5, 0, 0, 0.471
32, 5, 0, 1, 0.471
32, 37, 0, 0, 0.961
32, 37, 0, 1, 0.961
32, 0, 5, 0, 0.471
32, 0, 5, 1, 0.471
32, 0, 37, 0, 1.021
32, 0, 37, 1, 1.021
32, 5, 5, 0, 0.471
32, 5, 5, 1, 0.471
32, 37, 37, 0, 1.011
32, 37, 37, 1, 1.011
32, 2048, 0, 0, 0.471
32, 2048, 0, 1, 0.471
32, 2053, 0, 0, 0.471
32, 2053, 0, 1, 0.471
32, 2048, 5, 0, 0.471
32, 2048, 5, 1, 0.471
32, 2053, 5, 0, 0.471
32, 2053, 5, 1, 0.471
64, 0, 0, 0, 1.0
64, 0, 0, 1, 1.0
64, 6, 0, 0, 0.862
64, 6, 0, 1, 0.862
64, 38, 0, 0, 0.912
64, 38, 0, 1, 0.912
64, 0, 6, 0, 0.896
64, 0, 6, 1, 0.896
64, 0, 38, 0, 0.906
64, 0, 38, 1, 0.906
64, 6, 6, 0, 0.91
64, 6, 6, 1, 0.91
64, 38, 38, 0, 0.883
64, 38, 38, 1, 0.883
64, 2048, 0, 0, 1.0
64, 2048, 0, 1, 1.0
64, 2054, 0, 0, 0.862
64, 2054, 0, 1, 0.862
64, 2048, 6, 0, 0.887
64, 2048, 6, 1, 0.887
64, 2054, 6, 0, 0.887
64, 2054, 6, 1, 0.887
128, 0, 0, 0, 0.857
128, 0, 0, 1, 0.857
128, 7, 0, 0, 0.875
128, 7, 0, 1, 0.875
128, 39, 0, 0, 0.892
128, 39, 0, 1, 0.892
128, 0, 7, 0, 1.183
128, 0, 7, 1, 1.183
128, 0, 39, 0, 1.113
128, 0, 39, 1, 1.113
128, 7, 7, 0, 0.692
128, 7, 7, 1, 0.692
128, 39, 39, 0, 1.104
128, 39, 39, 1, 1.104
128, 2048, 0, 0, 0.857
128, 2048, 0, 1, 0.857
128, 2055, 0, 0, 0.875
128, 2055, 0, 1, 0.875
128, 2048, 7, 0, 0.959
128, 2048, 7, 1, 0.959
128, 2055, 7, 0, 1.036
128, 2055, 7, 1, 1.036
256, 0, 0, 0, 0.889
256, 0, 0, 1, 0.889
256, 8, 0, 0, 0.966
256, 8, 0, 1, 0.966
256, 40, 0, 0, 0.983
256, 40, 0, 1, 0.983
256, 0, 8, 0, 1.29
256, 0, 8, 1, 1.29
256, 0, 40, 0, 1.274
256, 0, 40, 1, 1.274
256, 8, 8, 0, 0.865
256, 8, 8, 1, 0.865
256, 40, 40, 0, 1.477
256, 40, 40, 1, 1.477
256, 2048, 0, 0, 0.889
256, 2048, 0, 1, 0.889
256, 2056, 0, 0, 0.966
256, 2056, 0, 1, 0.966
256, 2048, 8, 0, 0.952
256, 2048, 8, 1, 0.952
256, 2056, 8, 0, 0.878
256, 2056, 8, 1, 0.878
512, 0, 0, 0, 1.077
512, 0, 0, 1, 1.077
512, 9, 0, 0, 1.001
512, 9, 0, 1, 1.0
512, 41, 0, 0, 0.954
512, 41, 0, 1, 0.954
512, 0, 9, 0, 1.191
512, 0, 9, 1, 1.191
512, 0, 41, 0, 1.181
512, 0, 41, 1, 1.181
512, 9, 9, 0, 0.765
512, 9, 9, 1, 0.765
512, 41, 41, 0, 0.905
512, 41, 41, 1, 0.905
512, 2048, 0, 0, 1.077
512, 2048, 0, 1, 1.077
512, 2057, 0, 0, 1.0
512, 2057, 0, 1, 1.0
512, 2048, 9, 0, 1.0
512, 2048, 9, 1, 1.0
512, 2057, 9, 0, 0.733
512, 2057, 9, 1, 0.733
1024, 0, 0, 0, 1.143
1024, 0, 0, 1, 1.143
1024, 10, 0, 0, 1.015
1024, 10, 0, 1, 1.015
1024, 42, 0, 0, 1.045
1024, 42, 0, 1, 1.045
1024, 0, 10, 0, 1.126
1024, 0, 10, 1, 1.126
1024, 0, 42, 0, 1.114
1024, 0, 42, 1, 1.114
1024, 10, 10, 0, 0.89
1024, 10, 10, 1, 0.89
1024, 42, 42, 0, 0.986
1024, 42, 42, 1, 0.986
1024, 2048, 0, 0, 1.143
1024, 2048, 0, 1, 1.143
1024, 2058, 0, 0, 1.015
1024, 2058, 0, 1, 1.015
1024, 2048, 10, 0, 1.03
1024, 2048, 10, 1, 1.03
1024, 2058, 10, 0, 0.854
1024, 2058, 10, 1, 0.854
2048, 0, 0, 0, 1.005
2048, 0, 0, 1, 1.005
2048, 11, 0, 0, 1.013
2048, 11, 0, 1, 1.014
2048, 43, 0, 0, 1.044
2048, 43, 0, 1, 1.044
2048, 0, 11, 0, 1.003
2048, 0, 11, 1, 1.003
2048, 0, 43, 0, 1.003
2048, 0, 43, 1, 1.003
2048, 11, 11, 0, 0.92
2048, 11, 11, 1, 0.92
2048, 43, 43, 0, 1.0
2048, 43, 43, 1, 1.0
2048, 2048, 0, 0, 1.005
2048, 2048, 0, 1, 1.005
2048, 2059, 0, 0, 0.904
2048, 2059, 0, 1, 0.904
2048, 2048, 11, 0, 1.0
2048, 2048, 11, 1, 1.0
2048, 2059, 11, 0, 0.979
2048, 2059, 11, 1, 0.979
4096, 0, 0, 0, 1.014
4096, 0, 0, 1, 1.014
4096, 12, 0, 0, 0.855
4096, 12, 0, 1, 0.855
4096, 44, 0, 0, 0.857
4096, 44, 0, 1, 0.857
4096, 0, 12, 0, 0.932
4096, 0, 12, 1, 0.932
4096, 0, 44, 0, 0.932
4096, 0, 44, 1, 0.932
4096, 12, 12, 0, 0.999
4096, 12, 12, 1, 0.999
4096, 44, 44, 0, 1.051
4096, 44, 44, 1, 1.051
4096, 2048, 0, 0, 1.014
4096, 2048, 0, 1, 1.014
4096, 2060, 0, 0, 0.98
4096, 2060, 0, 1, 0.98
4096, 2048, 12, 0, 0.77
4096, 2048, 12, 1, 0.77
4096, 2060, 12, 0, 0.943
4096, 2060, 12, 1, 0.943
8192, 0, 0, 0, 1.046
8192, 0, 0, 1, 1.046
8192, 13, 0, 0, 0.885
8192, 13, 0, 1, 0.885
8192, 45, 0, 0, 0.887
8192, 45, 0, 1, 0.886
8192, 0, 13, 0, 0.942
8192, 0, 13, 1, 0.942
8192, 0, 45, 0, 0.942
8192, 0, 45, 1, 0.942
8192, 13, 13, 0, 1.03
8192, 13, 13, 1, 1.03
8192, 45, 45, 0, 1.048
8192, 45, 45, 1, 1.048
8192, 2048, 0, 0, 1.048
8192, 2048, 0, 1, 1.048
8192, 2061, 0, 0, 1.011
8192, 2061, 0, 1, 1.011
8192, 2048, 13, 0, 0.789
8192, 2048, 13, 1, 0.789
8192, 2061, 13, 0, 0.991
8192, 2061, 13, 1, 0.991
16384, 0, 0, 0, 1.014
16384, 0, 0, 1, 1.008
16384, 14, 0, 0, 0.951
16384, 14, 0, 1, 0.95
16384, 46, 0, 0, 0.874
16384, 46, 0, 1, 0.871
16384, 0, 14, 0, 0.813
16384, 0, 14, 1, 0.81
16384, 0, 46, 0, 0.85
16384, 0, 46, 1, 0.86
16384, 14, 14, 0, 0.985
16384, 14, 14, 1, 0.975
16384, 46, 46, 0, 1.025
16384, 46, 46, 1, 1.027
16384, 2048, 0, 0, 1.058
16384, 2048, 0, 1, 1.058
16384, 2062, 0, 0, 0.849
16384, 2062, 0, 1, 0.848
16384, 2048, 14, 0, 0.907
16384, 2048, 14, 1, 0.907
16384, 2062, 14, 0, 0.988
16384, 2062, 14, 1, 0.995
32768, 0, 0, 0, 0.979
32768, 0, 0, 1, 0.979
32768, 15, 0, 0, 1.006
32768, 15, 0, 1, 1.006
32768, 47, 0, 0, 1.004
32768, 47, 0, 1, 1.004
32768, 0, 15, 0, 1.045
32768, 0, 15, 1, 1.045
32768, 0, 47, 0, 1.011
32768, 0, 47, 1, 1.012
32768, 15, 15, 0, 0.977
32768, 15, 15, 1, 0.977
32768, 47, 47, 0, 0.96
32768, 47, 47, 1, 0.96
32768, 2048, 0, 0, 0.978
32768, 2048, 0, 1, 0.978
32768, 2063, 0, 0, 1.004
32768, 2063, 0, 1, 1.004
32768, 2048, 15, 0, 1.036
32768, 2048, 15, 1, 1.036
32768, 2063, 15, 0, 0.978
32768, 2063, 15, 1, 0.978
65536, 0, 0, 0, 0.981
65536, 0, 0, 1, 0.981
65536, 16, 0, 0, 0.987
65536, 16, 0, 1, 0.987
65536, 48, 0, 0, 0.968
65536, 48, 0, 1, 0.968
65536, 0, 16, 0, 1.014
65536, 0, 16, 1, 1.014
65536, 0, 48, 0, 0.984
65536, 0, 48, 1, 0.984
65536, 16, 16, 0, 1.01
65536, 16, 16, 1, 1.01
65536, 48, 48, 0, 0.968
65536, 48, 48, 1, 0.968
65536, 2048, 0, 0, 0.982
65536, 2048, 0, 1, 0.982
65536, 2064, 0, 0, 0.987
65536, 2064, 0, 1, 0.987
65536, 2048, 16, 0, 1.012
65536, 2048, 16, 1, 1.012
65536, 2064, 16, 0, 1.007
65536, 2064, 16, 1, 1.007
0, 0, 0, 0, 2.104
0, 2048, 0, 0, 2.104
0, 4095, 0, 0, 2.109
0, 0, 4095, 0, 2.103
1, 1, 0, 0, 2.104
1, 0, 1, 0, 2.098
1, 1, 1, 0, 2.098
1, 2049, 0, 0, 2.102
1, 2048, 1, 0, 2.098
1, 2049, 1, 0, 2.098
1, 4095, 0, 0, 2.103
1, 0, 4095, 0, 2.098
2, 2, 0, 0, 1.139
2, 0, 2, 0, 1.136
2, 2, 2, 0, 1.136
2, 2050, 0, 0, 1.139
2, 2048, 2, 0, 1.136
2, 2050, 2, 0, 1.136
2, 4095, 0, 0, 1.0
2, 0, 4095, 0, 1.022
3, 0, 0, 0, 0.981
3, 3, 0, 0, 0.984
3, 0, 3, 0, 0.982
3, 3, 3, 0, 0.982
3, 2048, 0, 0, 0.982
3, 2051, 0, 0, 0.983
3, 2048, 3, 0, 0.982
3, 2051, 3, 0, 0.982
3, 4095, 0, 0, 0.285
3, 0, 4095, 0, 0.231
4, 4, 0, 0, 1.373
4, 0, 4, 0, 1.31
4, 4, 4, 0, 1.282
4, 2052, 0, 0, 1.264
4, 2048, 4, 0, 1.254
4, 2052, 4, 0, 1.254
4, 4095, 0, 0, 1.971
4, 0, 4095, 0, 1.994
5, 0, 0, 0, 1.145
5, 5, 0, 0, 1.155
5, 0, 5, 0, 1.171
5, 5, 5, 0, 1.171
5, 2048, 0, 0, 1.197
5, 2053, 0, 0, 1.173
5, 2048, 5, 0, 1.171
5, 2053, 5, 0, 1.171
5, 4095, 0, 0, 0.935
5, 0, 4095, 0, 1.017
6, 0, 0, 0, 1.145
6, 6, 0, 0, 1.098
6, 0, 6, 0, 1.096
6, 6, 6, 0, 1.096
6, 2048, 0, 0, 1.12
6, 2054, 0, 0, 1.122
6, 2048, 6, 0, 1.12
6, 2054, 6, 0, 1.096
6, 4095, 0, 0, 0.935
6, 0, 4095, 0, 1.018
7, 0, 0, 0, 1.071
7, 7, 0, 0, 1.074
7, 0, 7, 0, 1.072
7, 7, 7, 0, 1.072
7, 2048, 0, 0, 1.096
7, 2055, 0, 0, 1.098
7, 2048, 7, 0, 1.096
7, 2055, 7, 0, 1.096
7, 4095, 0, 0, 0.935
7, 0, 4095, 0, 1.016
8, 8, 0, 0, 1.167
8, 0, 8, 0, 1.028
8, 8, 8, 0, 1.028
8, 2056, 0, 0, 1.069
8, 2048, 8, 0, 1.028
8, 2056, 8, 0, 1.028
8, 4095, 0, 0, 1.029
8, 0, 4095, 0, 1.043
9, 0, 0, 0, 0.799
9, 9, 0, 0, 0.801
9, 0, 9, 0, 0.799
9, 9, 9, 0, 0.799
9, 2048, 0, 0, 0.8
9, 2057, 0, 0, 0.801
9, 2048, 9, 0, 0.8
9, 2057, 9, 0, 0.799
9, 4095, 0, 0, 0.909
9, 0, 4095, 0, 1.0
10, 0, 0, 0, 0.799
10, 10, 0, 0, 0.801
10, 0, 10, 0, 0.8
10, 10, 10, 0, 0.8
10, 2048, 0, 0, 0.8
10, 2058, 0, 0, 0.801
10, 2048, 10, 0, 0.8
10, 2058, 10, 0, 0.8
10, 4095, 0, 0, 0.909
10, 0, 4095, 0, 1.0
11, 0, 0, 0, 0.799
11, 11, 0, 0, 0.801
11, 0, 11, 0, 0.8
11, 11, 11, 0, 0.8
11, 2048, 0, 0, 0.8
11, 2059, 0, 0, 0.802
11, 2048, 11, 0, 0.8
11, 2059, 11, 0, 0.8
11, 4095, 0, 0, 0.909
11, 0, 4095, 0, 1.0
12, 0, 0, 0, 0.799
12, 12, 0, 0, 0.801
12, 0, 12, 0, 0.8
12, 12, 12, 0, 0.8
12, 2048, 0, 0, 0.8
12, 2060, 0, 0, 0.802
12, 2048, 12, 0, 0.8
12, 2060, 12, 0, 0.8
12, 4095, 0, 0, 0.909
12, 0, 4095, 0, 1.0
13, 0, 0, 0, 0.798
13, 13, 0, 0, 0.801
13, 0, 13, 0, 0.799
13, 13, 13, 0, 0.799
13, 2048, 0, 0, 0.8
13, 2061, 0, 0, 0.801
13, 2048, 13, 0, 0.8
13, 2061, 13, 0, 0.8
13, 4095, 0, 0, 0.909
13, 0, 4095, 0, 1.0
14, 0, 0, 0, 0.799
14, 14, 0, 0, 0.801
14, 0, 14, 0, 0.8
14, 14, 14, 0, 0.8
14, 2048, 0, 0, 0.8
14, 2062, 0, 0, 0.801
14, 2048, 14, 0, 0.8
14, 2062, 14, 0, 0.8
14, 4095, 0, 0, 0.909
14, 0, 4095, 0, 1.0
15, 0, 0, 0, 0.799
15, 15, 0, 0, 0.801
15, 0, 15, 0, 0.8
15, 15, 15, 0, 0.8
15, 2048, 0, 0, 0.8
15, 2063, 0, 0, 0.802
15, 2048, 15, 0, 0.8
15, 2063, 15, 0, 0.8
15, 4095, 0, 0, 0.909
15, 0, 4095, 0, 1.0
16, 16, 0, 0, 0.801
16, 0, 16, 0, 0.799
16, 16, 16, 0, 0.799
16, 2064, 0, 0, 0.801
16, 2048, 16, 0, 0.798
16, 2064, 16, 0, 0.798
16, 4095, 0, 0, 1.818
16, 0, 4095, 0, 1.957
17, 0, 0, 0, 0.798
17, 17, 0, 0, 0.8
17, 0, 17, 0, 0.799
17, 17, 17, 0, 0.798
17, 2048, 0, 0, 0.798
17, 2065, 0, 0, 0.8
17, 2048, 17, 0, 0.798
17, 2065, 17, 0, 0.799
17, 4095, 0, 0, 0.937
17, 0, 4095, 0, 1.021
18, 0, 0, 0, 0.798
18, 18, 0, 0, 0.801
18, 0, 18, 0, 0.798
18, 18, 18, 0, 0.798
18, 2048, 0, 0, 0.799
18, 2066, 0, 0, 0.8
18, 2048, 18, 0, 0.798
18, 2066, 18, 0, 0.798
18, 4095, 0, 0, 0.937
18, 0, 4095, 0, 1.021
19, 0, 0, 0, 0.798
19, 19, 0, 0, 0.8
19, 0, 19, 0, 0.798
19, 19, 19, 0, 0.798
19, 2048, 0, 0, 0.798
19, 2067, 0, 0, 0.8
19, 2048, 19, 0, 0.798
19, 2067, 19, 0, 0.798
19, 4095, 0, 0, 0.937
19, 0, 4095, 0, 1.021
20, 0, 0, 0, 0.798
20, 20, 0, 0, 0.8
20, 0, 20, 0, 0.798
20, 20, 20, 0, 0.798
20, 2048, 0, 0, 0.798
20, 2068, 0, 0, 0.8
20, 2048, 20, 0, 0.798
20, 2068, 20, 0, 0.798
20, 4095, 0, 0, 0.937
20, 0, 4095, 0, 1.021
21, 0, 0, 0, 0.798
21, 21, 0, 0, 0.801
21, 0, 21, 0, 0.798
21, 21, 21, 0, 0.798
21, 2048, 0, 0, 0.798
21, 2069, 0, 0, 0.801
21, 2048, 21, 0, 0.799
21, 2069, 21, 0, 0.798
21, 4095, 0, 0, 0.937
21, 0, 4095, 0, 1.021
22, 0, 0, 0, 0.798
22, 22, 0, 0, 0.801
22, 0, 22, 0, 0.798
22, 22, 22, 0, 0.798
22, 2048, 0, 0, 0.798
22, 2070, 0, 0, 0.801
22, 2048, 22, 0, 0.798
22, 2070, 22, 0, 0.798
22, 4095, 0, 0, 0.937
22, 0, 4095, 0, 1.021
23, 0, 0, 0, 0.798
23, 23, 0, 0, 0.8
23, 0, 23, 0, 0.798
23, 23, 23, 0, 0.798
23, 2048, 0, 0, 0.798
23, 2071, 0, 0, 0.8
23, 2048, 23, 0, 0.798
23, 2071, 23, 0, 0.798
23, 4095, 0, 0, 0.937
23, 0, 4095, 0, 1.021
24, 0, 0, 0, 0.798
24, 24, 0, 0, 0.8
24, 0, 24, 0, 0.799
24, 24, 24, 0, 0.798
24, 2048, 0, 0, 0.798
24, 2072, 0, 0, 0.801
24, 2048, 24, 0, 0.798
24, 2072, 24, 0, 0.798
24, 4095, 0, 0, 0.937
24, 0, 4095, 0, 1.021
25, 0, 0, 0, 0.5
25, 25, 0, 0, 0.5
25, 0, 25, 0, 0.5
25, 25, 25, 0, 0.5
25, 2048, 0, 0, 0.5
25, 2073, 0, 0, 0.501
25, 2048, 25, 0, 0.5
25, 2073, 25, 0, 0.5
25, 4095, 0, 0, 0.974
25, 0, 4095, 0, 0.98
26, 0, 0, 0, 0.5
26, 26, 0, 0, 0.501
26, 0, 26, 0, 0.5
26, 26, 26, 0, 0.501
26, 2048, 0, 0, 0.5
26, 2074, 0, 0, 0.5
26, 2048, 26, 0, 0.5
26, 2074, 26, 0, 0.5
26, 4095, 0, 0, 0.974
26, 0, 4095, 0, 1.0
27, 0, 0, 0, 0.5
27, 27, 0, 0, 0.501
27, 0, 27, 0, 0.5
27, 27, 27, 0, 0.5
27, 2048, 0, 0, 0.5
27, 2075, 0, 0, 0.5
27, 2048, 27, 0, 0.5
27, 2075, 27, 0, 0.5
27, 4095, 0, 0, 0.974
27, 0, 4095, 0, 1.0
28, 0, 0, 0, 0.5
28, 28, 0, 0, 0.501
28, 0, 28, 0, 0.5
28, 28, 28, 0, 0.5
28, 2048, 0, 0, 0.5
28, 2076, 0, 0, 0.5
28, 2048, 28, 0, 0.5
28, 2076, 28, 0, 0.5
28, 4095, 0, 0, 0.974
28, 0, 4095, 0, 1.0
29, 0, 0, 0, 0.471
29, 29, 0, 0, 0.471
29, 0, 29, 0, 0.471
29, 29, 29, 0, 0.471
29, 2048, 0, 0, 0.471
29, 2077, 0, 0, 0.471
29, 2048, 29, 0, 0.471
29, 2077, 29, 0, 0.471
29, 4095, 0, 0, 0.974
29, 0, 4095, 0, 1.0
30, 0, 0, 0, 0.471
30, 30, 0, 0, 0.471
30, 0, 30, 0, 0.471
30, 30, 30, 0, 0.471
30, 2048, 0, 0, 0.471
30, 2078, 0, 0, 0.471
30, 2048, 30, 0, 0.471
30, 2078, 30, 0, 0.471
30, 4095, 0, 0, 0.974
30, 0, 4095, 0, 1.0
31, 0, 0, 0, 0.471
31, 31, 0, 0, 0.471
31, 0, 31, 0, 0.471
31, 31, 31, 0, 0.471
31, 2048, 0, 0, 0.471
31, 2079, 0, 0, 0.471
31, 2048, 31, 0, 0.471
31, 2079, 31, 0, 0.471
31, 4095, 0, 0, 0.974
31, 0, 4095, 0, 1.0
48, 0, 0, 0, 1.0
48, 0, 0, 1, 1.0
48, 3, 0, 0, 1.0
48, 3, 0, 1, 1.0
48, 0, 3, 0, 1.0
48, 0, 3, 1, 1.0
48, 3, 3, 0, 1.0
48, 3, 3, 1, 1.0
48, 2048, 0, 0, 1.0
48, 2048, 0, 1, 1.0
48, 2051, 0, 0, 1.0
48, 2051, 0, 1, 1.0
48, 2048, 3, 0, 1.0
48, 2048, 3, 1, 1.0
48, 2051, 3, 0, 1.0
48, 2051, 3, 1, 1.0
80, 0, 0, 0, 0.781
80, 0, 0, 1, 0.782
80, 5, 0, 0, 0.976
80, 5, 0, 1, 0.976
80, 0, 5, 0, 1.232
80, 0, 5, 1, 1.232
80, 5, 5, 0, 1.542
80, 5, 5, 1, 1.543
80, 2048, 0, 0, 0.781
80, 2048, 0, 1, 0.782
80, 2053, 0, 0, 0.976
80, 2053, 0, 1, 0.976
80, 2048, 5, 0, 1.093
80, 2048, 5, 1, 1.093
80, 2053, 5, 0, 1.371
80, 2053, 5, 1, 1.371
96, 0, 0, 0, 0.758
96, 0, 0, 1, 0.758
96, 6, 0, 0, 0.929
96, 6, 0, 1, 0.929
96, 0, 6, 0, 1.204
96, 0, 6, 1, 1.204
96, 6, 6, 0, 1.562
96, 6, 6, 1, 1.562
96, 2048, 0, 0, 0.758
96, 2048, 0, 1, 0.758
96, 2054, 0, 0, 0.929
96, 2054, 0, 1, 0.929
96, 2048, 6, 0, 1.068
96, 2048, 6, 1, 1.068
96, 2054, 6, 0, 1.562
96, 2054, 6, 1, 1.562
112, 0, 0, 0, 0.736
112, 0, 0, 1, 0.736
112, 7, 0, 0, 0.675
112, 7, 0, 1, 0.675
112, 0, 7, 0, 0.778
112, 0, 7, 1, 0.778
112, 7, 7, 0, 0.909
112, 7, 7, 1, 0.909
112, 2048, 0, 0, 0.736
112, 2048, 0, 1, 0.736
112, 2055, 0, 0, 0.675
112, 2055, 0, 1, 0.675
112, 2048, 7, 0, 0.778
112, 2048, 7, 1, 0.778
112, 2055, 7, 0, 0.909
112, 2055, 7, 1, 0.909
144, 0, 0, 0, 0.857
144, 0, 0, 1, 0.857
144, 9, 0, 0, 0.941
144, 9, 0, 1, 0.943
144, 0, 9, 0, 1.137
144, 0, 9, 1, 1.137
144, 9, 9, 0, 1.514
144, 9, 9, 1, 1.514
144, 2048, 0, 0, 0.857
144, 2048, 0, 1, 0.857
144, 2057, 0, 0, 0.939
144, 2057, 0, 1, 0.945
144, 2048, 9, 0, 0.922
144, 2048, 9, 1, 0.922
144, 2057, 9, 0, 1.514
144, 2057, 9, 1, 1.514
160, 0, 0, 0, 0.698
160, 0, 0, 1, 0.698
160, 10, 0, 0, 0.91
160, 10, 0, 1, 0.91
160, 0, 10, 0, 1.211
160, 0, 10, 1, 1.212
160, 10, 10, 0, 1.357
160, 10, 10, 1, 1.357
160, 2048, 0, 0, 0.698
160, 2048, 0, 1, 0.698
160, 2058, 0, 0, 0.91
160, 2058, 0, 1, 0.91
160, 2048, 10, 0, 0.923
160, 2048, 10, 1, 0.923
160, 2058, 10, 0, 1.357
160, 2058, 10, 1, 1.357
176, 0, 0, 0, 0.796
176, 0, 0, 1, 0.796
176, 11, 0, 0, 0.804
176, 11, 0, 1, 0.804
176, 0, 11, 0, 0.774
176, 0, 11, 1, 0.774
176, 11, 11, 0, 0.814
176, 11, 11, 1, 0.814
176, 2048, 0, 0, 0.796
176, 2048, 0, 1, 0.796
176, 2059, 0, 0, 0.804
176, 2059, 0, 1, 0.804
176, 2048, 11, 0, 0.774
176, 2048, 11, 1, 0.774
176, 2059, 11, 0, 0.814
176, 2059, 11, 1, 0.814
192, 0, 0, 0, 0.778
192, 0, 0, 1, 0.778
192, 12, 0, 0, 0.881
192, 12, 0, 1, 0.881
192, 0, 12, 0, 1.167
192, 0, 12, 1, 1.167
192, 12, 12, 0, 0.841
192, 12, 12, 1, 0.841
192, 2048, 0, 0, 0.778
192, 2048, 0, 1, 0.778
192, 2060, 0, 0, 0.881
192, 2060, 0, 1, 0.881
192, 2048, 12, 0, 0.889
192, 2048, 12, 1, 0.889
192, 2060, 12, 0, 0.906
192, 2060, 12, 1, 0.906
208, 0, 0, 0, 0.833
208, 0, 0, 1, 0.833
208, 13, 0, 0, 0.921
208, 13, 0, 1, 0.921
208, 0, 13, 0, 0.835
208, 0, 13, 1, 0.833
208, 13, 13, 0, 1.333
208, 13, 13, 1, 1.333
208, 2048, 0, 0, 0.833
208, 2048, 0, 1, 0.833
208, 2061, 0, 0, 0.921
208, 2061, 0, 1, 0.921
208, 2048, 13, 0, 0.833
208, 2048, 13, 1, 0.833
208, 2061, 13, 0, 1.333
208, 2061, 13, 1, 1.333
224, 0, 0, 0, 0.93
224, 0, 0, 1, 0.93
224, 14, 0, 0, 1.0
224, 14, 0, 1, 1.0
224, 0, 14, 0, 1.15
224, 0, 14, 1, 1.15
224, 14, 14, 0, 1.452
224, 14, 14, 1, 1.452
224, 2048, 0, 0, 0.93
224, 2048, 0, 1, 0.93
224, 2062, 0, 0, 1.0
224, 2062, 0, 1, 1.0
224, 2048, 14, 0, 0.833
224, 2048, 14, 1, 0.833
224, 2062, 14, 0, 1.452
224, 2062, 14, 1, 1.452
240, 0, 0, 0, 0.909
240, 0, 0, 1, 0.909
240, 15, 0, 0, 0.797
240, 15, 0, 1, 0.797
240, 0, 15, 0, 0.771
240, 0, 15, 1, 0.771
240, 15, 15, 0, 0.93
240, 15, 15, 1, 0.93
240, 2048, 0, 0, 0.909
240, 2048, 0, 1, 0.909
240, 2063, 0, 0, 0.797
240, 2063, 0, 1, 0.797
240, 2048, 15, 0, 0.771
240, 2048, 15, 1, 0.771
240, 2063, 15, 0, 0.93
240, 2063, 15, 1, 0.93
272, 0, 0, 0, 0.9
272, 0, 0, 1, 0.9
272, 17, 0, 0, 1.015
272, 17, 0, 1, 1.015
272, 0, 17, 0, 0.926
272, 0, 17, 1, 0.927
272, 17, 17, 0, 0.892
272, 17, 17, 1, 0.892
272, 2048, 0, 0, 0.9
272, 2048, 0, 1, 0.9
272, 2065, 0, 0, 1.015
272, 2065, 0, 1, 1.015
272, 2048, 17, 0, 0.927
272, 2048, 17, 1, 0.927
272, 2065, 17, 0, 0.878
272, 2065, 17, 1, 0.878
288, 0, 0, 0, 0.882
288, 0, 0, 1, 0.882
288, 18, 0, 0, 0.803
288, 18, 0, 1, 0.803
288, 0, 18, 0, 0.768
288, 0, 18, 1, 0.768
288, 18, 18, 0, 0.882
288, 18, 18, 1, 0.882
288, 2048, 0, 0, 0.882
288, 2048, 0, 1, 0.882
288, 2066, 0, 0, 0.803
288, 2066, 0, 1, 0.803
288, 2048, 18, 0, 0.768
288, 2048, 18, 1, 0.768
288, 2066, 18, 0, 0.882
288, 2066, 18, 1, 0.882
304, 0, 0, 0, 0.865
304, 0, 0, 1, 0.865
304, 19, 0, 0, 0.944
304, 19, 0, 1, 0.944
304, 0, 19, 0, 0.943
304, 0, 19, 1, 0.943
304, 19, 19, 0, 0.956
304, 19, 19, 1, 0.956
304, 2048, 0, 0, 0.866
304, 2048, 0, 1, 0.865
304, 2067, 0, 0, 0.944
304, 2067, 0, 1, 0.944
304, 2048, 19, 0, 0.943
304, 2048, 19, 1, 0.943
304, 2067, 19, 0, 0.947
304, 2067, 19, 1, 0.947
320, 0, 0, 0, 0.944
320, 0, 0, 1, 0.944
320, 20, 0, 0, 0.962
320, 20, 0, 1, 0.962
320, 0, 20, 0, 1.214
320, 0, 20, 1, 1.214
320, 20, 20, 0, 1.365
320, 20, 20, 1, 1.365
320, 2048, 0, 0, 0.943
320, 2048, 0, 1, 0.943
320, 2068, 0, 0, 0.962
320, 2068, 0, 1, 0.962
320, 2048, 20, 0, 0.914
320, 2048, 20, 1, 0.914
320, 2068, 20, 0, 1.365
320, 2068, 20, 1, 1.365
336, 0, 0, 0, 1.0
336, 0, 0, 1, 1.0
336, 21, 0, 0, 0.986
336, 21, 0, 1, 0.986
336, 0, 21, 0, 0.853
336, 0, 21, 1, 0.853
336, 21, 21, 0, 0.843
336, 21, 21, 1, 0.843
336, 2048, 0, 0, 1.0
336, 2048, 0, 1, 1.0
336, 2069, 0, 0, 0.986
336, 2069, 0, 1, 0.986
336, 2048, 21, 0, 0.853
336, 2048, 21, 1, 0.853
336, 2069, 21, 0, 0.831
336, 2069, 21, 1, 0.831
352, 0, 0, 0, 0.98
352, 0, 0, 1, 0.98
352, 22, 0, 0, 0.811
352, 22, 0, 1, 0.811
352, 0, 22, 0, 0.882
352, 0, 22, 1, 0.882
352, 22, 22, 0, 1.1
352, 22, 22, 1, 1.1
352, 2048, 0, 0, 0.98
352, 2048, 0, 1, 0.98
352, 2070, 0, 0, 0.811
352, 2070, 0, 1, 0.811
352, 2048, 22, 0, 0.882
352, 2048, 22, 1, 0.882
352, 2070, 22, 0, 1.1
352, 2070, 22, 1, 1.1
368, 0, 0, 0, 1.058
368, 0, 0, 1, 1.058
368, 23, 0, 0, 1.0
368, 23, 0, 1, 1.0
368, 0, 23, 0, 0.948
368, 0, 23, 1, 0.948
368, 23, 23, 0, 0.723
368, 23, 23, 1, 0.723
368, 2048, 0, 0, 1.058
368, 2048, 0, 1, 1.058
368, 2071, 0, 0, 1.0
368, 2071, 0, 1, 1.0
368, 2048, 23, 0, 0.948
368, 2048, 23, 1, 0.948
368, 2071, 23, 0, 0.701
368, 2071, 23, 1, 0.701
384, 0, 0, 0, 1.012
384, 0, 0, 1, 1.012
384, 24, 0, 0, 1.04
384, 24, 0, 1, 1.04
384, 0, 24, 0, 1.154
384, 0, 24, 1, 1.154
384, 24, 24, 0, 1.423
384, 24, 24, 1, 1.423
384, 2048, 0, 0, 1.012
384, 2048, 0, 1, 1.012
384, 2072, 0, 0, 1.04
384, 2072, 0, 1, 1.04
384, 2048, 24, 0, 0.91
384, 2048, 24, 1, 0.91
384, 2072, 24, 0, 1.423
384, 2072, 24, 1, 1.423
400, 0, 0, 0, 0.948
400, 0, 0, 1, 0.948
400, 25, 0, 0, 0.957
400, 25, 0, 1, 0.957
400, 0, 25, 0, 1.099
400, 0, 25, 1, 1.069
400, 25, 25, 0, 0.885
400, 25, 25, 1, 0.885
400, 2048, 0, 0, 0.948
400, 2048, 0, 1, 0.948
400, 2073, 0, 0, 0.957
400, 2073, 0, 1, 0.957
400, 2048, 25, 0, 0.94
400, 2048, 25, 1, 0.94
400, 2073, 25, 0, 0.908
400, 2073, 25, 1, 0.908
416, 0, 0, 0, 1.017
416, 0, 0, 1, 1.017
416, 26, 0, 0, 0.903
416, 26, 0, 1, 0.903
416, 0, 26, 0, 0.881
416, 0, 26, 1, 0.881
416, 26, 26, 0, 1.035
416, 26, 26, 1, 1.035
416, 2048, 0, 0, 1.017
416, 2048, 0, 1, 1.017
416, 2074, 0, 0, 0.903
416, 2074, 0, 1, 0.903
416, 2048, 26, 0, 0.881
416, 2048, 26, 1, 0.881
416, 2074, 26, 0, 1.034
416, 2074, 26, 1, 1.035
432, 0, 0, 0, 1.0
432, 0, 0, 1, 1.0
432, 27, 0, 0, 0.933
432, 27, 0, 1, 0.933
432, 0, 27, 0, 0.941
432, 0, 27, 1, 0.941
432, 27, 27, 0, 0.953
432, 27, 27, 1, 0.954
432, 2048, 0, 0, 1.0
432, 2048, 0, 1, 1.0
432, 2075, 0, 0, 0.933
432, 2075, 0, 1, 0.933
432, 2048, 27, 0, 0.941
432, 2048, 27, 1, 0.941
432, 2075, 27, 0, 0.93
432, 2075, 27, 1, 0.93
448, 0, 0, 0, 0.984
448, 0, 0, 1, 0.984
448, 28, 0, 0, 0.896
448, 28, 0, 1, 0.896
448, 0, 28, 0, 1.244
448, 0, 28, 1, 1.244
448, 28, 28, 0, 1.333
448, 28, 28, 1, 1.333
448, 2048, 0, 0, 0.984
448, 2048, 0, 1, 0.984
448, 2076, 0, 0, 0.896
448, 2076, 0, 1, 0.896
448, 2048, 28, 0, 0.988
448, 2048, 28, 1, 0.988
448, 2076, 28, 0, 1.333
448, 2076, 28, 1, 1.333
464, 0, 0, 0, 1.083
464, 0, 0, 1, 1.083
464, 29, 0, 0, 0.978
464, 29, 0, 1, 0.978
464, 0, 29, 0, 0.924
464, 0, 29, 1, 0.924
464, 29, 29, 0, 0.901
464, 29, 29, 1, 0.901
464, 2048, 0, 0, 1.083
464, 2048, 0, 1, 1.083
464, 2077, 0, 0, 0.978
464, 2077, 0, 1, 0.978
464, 2048, 29, 0, 0.924
464, 2048, 29, 1, 0.924
464, 2077, 29, 0, 0.89
464, 2077, 29, 1, 0.89
480, 0, 0, 0, 1.066
480, 0, 0, 1, 1.066
480, 30, 0, 0, 0.9
480, 30, 0, 1, 0.9
480, 0, 30, 0, 0.88
480, 0, 30, 1, 0.88
480, 30, 30, 0, 1.083
480, 30, 30, 1, 1.083
480, 2048, 0, 0, 1.066
480, 2048, 0, 1, 1.066
480, 2078, 0, 0, 0.9
480, 2078, 0, 1, 0.9
480, 2048, 30, 0, 0.88
480, 2048, 30, 1, 0.88
480, 2078, 30, 0, 1.083
480, 2078, 30, 1, 1.083
496, 0, 0, 0, 1.032
496, 0, 0, 1, 1.032
496, 31, 0, 0, 0.95
496, 31, 0, 1, 0.95
496, 0, 31, 0, 1.011
496, 0, 31, 1, 1.011
496, 31, 31, 0, 0.973
496, 31, 31, 1, 0.973
496, 2048, 0, 0, 1.032
496, 2048, 0, 1, 1.032
496, 2079, 0, 0, 0.95
496, 2079, 0, 1, 0.95
496, 2048, 31, 0, 1.011
496, 2048, 31, 1, 1.011
496, 2079, 31, 0, 0.941
496, 2079, 31, 1, 0.941
1024, 32, 0, 0, 1.143
1024, 32, 0, 1, 1.143
1024, 0, 32, 0, 1.143
1024, 0, 32, 1, 1.143
1024, 32, 32, 0, 1.143
1024, 32, 32, 1, 1.143
1024, 2080, 0, 0, 1.143
1024, 2080, 0, 1, 1.143
1024, 2048, 32, 0, 1.143
1024, 2048, 32, 1, 1.143
1024, 2080, 32, 0, 1.143
1024, 2080, 32, 1, 1.143
1056, 0, 0, 0, 1.168
1056, 0, 0, 1, 1.168
1056, 33, 0, 0, 1.067
1056, 33, 0, 1, 1.067
1056, 0, 33, 0, 0.977
1056, 0, 33, 1, 0.977
1056, 33, 33, 0, 1.043
1056, 33, 33, 1, 1.043
1056, 2048, 0, 0, 1.168
1056, 2048, 0, 1, 1.168
1056, 2081, 0, 0, 1.067
1056, 2081, 0, 1, 1.067
1056, 2048, 33, 0, 0.977
1056, 2048, 33, 1, 0.977
1056, 2081, 33, 0, 1.0
1056, 2081, 33, 1, 1.0
1088, 0, 0, 0, 1.171
1088, 0, 0, 1, 1.171
1088, 34, 0, 0, 1.041
1088, 34, 0, 1, 1.041
1088, 0, 34, 0, 1.079
1088, 0, 34, 1, 1.079
1088, 34, 34, 0, 0.966
1088, 34, 34, 1, 0.966
1088, 2048, 0, 0, 1.171
1088, 2048, 0, 1, 1.171
1088, 2082, 0, 0, 1.041
1088, 2082, 0, 1, 1.041
1088, 2048, 34, 0, 0.994
1088, 2048, 34, 1, 0.994
1088, 2082, 34, 0, 0.966
1088, 2082, 34, 1, 0.966
1120, 0, 0, 0, 1.152
1120, 0, 0, 1, 1.153
1120, 35, 0, 0, 1.051
1120, 35, 0, 1, 1.051
1120, 0, 35, 0, 1.0
1120, 0, 35, 1, 1.0
1120, 35, 35, 0, 1.068
1120, 35, 35, 1, 1.068
1120, 2048, 0, 0, 1.151
1120, 2048, 0, 1, 1.151
1120, 2083, 0, 0, 1.051
1120, 2083, 0, 1, 1.051
1120, 2048, 35, 0, 1.0
1120, 2048, 35, 1, 1.0
1120, 2083, 35, 0, 1.027
1120, 2083, 35, 1, 1.027
1152, 0, 0, 0, 1.159
1152, 0, 0, 1, 1.159
1152, 36, 0, 0, 1.034
1152, 36, 0, 1, 1.034
1152, 0, 36, 0, 1.07
1152, 0, 36, 1, 1.07
1152, 36, 36, 0, 0.967
1152, 36, 36, 1, 0.967
1152, 2048, 0, 0, 1.159
1152, 2048, 0, 1, 1.159
1152, 2084, 0, 0, 1.034
1152, 2084, 0, 1, 1.034
1152, 2048, 36, 0, 0.984
1152, 2048, 36, 1, 0.984
1152, 2084, 36, 0, 0.967
1152, 2084, 36, 1, 0.967
1184, 0, 0, 0, 1.157
1184, 0, 0, 1, 1.157
1184, 37, 0, 0, 1.067
1184, 37, 0, 1, 1.066
1184, 0, 37, 0, 0.993
1184, 0, 37, 1, 0.993
1184, 37, 37, 0, 1.08
1184, 37, 37, 1, 1.081
1184, 2048, 0, 0, 1.157
1184, 2048, 0, 1, 1.157
1184, 2085, 0, 0, 1.066
1184, 2085, 0, 1, 1.066
1184, 2048, 37, 0, 0.993
1184, 2048, 37, 1, 0.993
1184, 2085, 37, 0, 1.04
1184, 2085, 37, 1, 1.04
1216, 0, 0, 0, 1.139
1216, 0, 0, 1, 1.139
1216, 38, 0, 0, 1.024
1216, 38, 0, 1, 1.024
1216, 0, 38, 0, 1.087
1216, 0, 38, 1, 1.087
1216, 38, 38, 0, 1.0
1216, 38, 38, 1, 1.0
1216, 2048, 0, 0, 1.138
1216, 2048, 0, 1, 1.138
1216, 2086, 0, 0, 1.024
1216, 2086, 0, 1, 1.024
1216, 2048, 38, 0, 1.01
1216, 2048, 38, 1, 1.01
1216, 2086, 38, 0, 1.0
1216, 2086, 38, 1, 1.0
1248, 0, 0, 0, 1.176
1248, 0, 0, 1, 1.174
1248, 39, 0, 0, 1.074
1248, 39, 0, 1, 1.074
1248, 0, 39, 0, 0.966
1248, 0, 39, 1, 0.985
1248, 39, 39, 0, 1.064
1248, 39, 39, 1, 1.064
1248, 2048, 0, 0, 1.179
1248, 2048, 0, 1, 1.179
1248, 2087, 0, 0, 1.074
1248, 2087, 0, 1, 1.074
1248, 2048, 39, 0, 0.985
1248, 2048, 39, 1, 0.985
1248, 2087, 39, 0, 1.026
1248, 2087, 39, 1, 1.026
1280, 0, 0, 0, 0.993
1280, 0, 0, 1, 0.993
1280, 40, 0, 0, 1.051
1280, 40, 0, 1, 1.051
1280, 0, 40, 0, 1.044
1280, 0, 40, 1, 1.045
1280, 40, 40, 0, 1.25
1280, 40, 40, 1, 1.25
1280, 2048, 0, 0, 0.992
1280, 2048, 0, 1, 0.992
1280, 2088, 0, 0, 1.051
1280, 2088, 0, 1, 1.051
1280, 2048, 40, 0, 0.946
1280, 2048, 40, 1, 0.946
1280, 2088, 40, 0, 1.252
1280, 2088, 40, 1, 1.252
1312, 0, 0, 0, 0.969
1312, 0, 0, 1, 0.969
1312, 41, 0, 0, 0.991
1312, 41, 0, 1, 0.991
1312, 0, 41, 0, 0.837
1312, 0, 41, 1, 0.837
1312, 41, 41, 0, 1.025
1312, 41, 41, 1, 1.025
1312, 2048, 0, 0, 0.969
1312, 2048, 0, 1, 0.969
1312, 2089, 0, 0, 0.991
1312, 2089, 0, 1, 0.99
1312, 2048, 41, 0, 0.837
1312, 2048, 41, 1, 0.837
1312, 2089, 41, 0, 0.975
1312, 2089, 41, 1, 0.975
1344, 0, 0, 0, 0.988
1344, 0, 0, 1, 0.988
1344, 42, 0, 0, 1.031
1344, 42, 0, 1, 1.031
1344, 0, 42, 0, 1.033
1344, 0, 42, 1, 1.033
1344, 42, 42, 0, 0.982
1344, 42, 42, 1, 0.982
1344, 2048, 0, 0, 0.992
1344, 2048, 0, 1, 0.992
1344, 2090, 0, 0, 1.031
1344, 2090, 0, 1, 1.031
1344, 2048, 42, 0, 0.943
1344, 2048, 42, 1, 0.942
1344, 2090, 42, 0, 0.982
1344, 2090, 42, 1, 0.982
1376, 0, 0, 0, 1.016
1376, 0, 0, 1, 1.016
1376, 43, 0, 0, 1.01
1376, 43, 0, 1, 1.01
1376, 0, 43, 0, 0.829
1376, 0, 43, 1, 0.829
1376, 43, 43, 0, 1.024
1376, 43, 43, 1, 1.024
1376, 2048, 0, 0, 1.006
1376, 2048, 0, 1, 1.015
1376, 2091, 0, 0, 1.01
1376, 2091, 0, 1, 1.01
1376, 2048, 43, 0, 0.829
1376, 2048, 43, 1, 0.829
1376, 2091, 43, 0, 0.98
1376, 2091, 43, 1, 0.98
1408, 0, 0, 0, 0.987
1408, 0, 0, 1, 0.987
1408, 44, 0, 0, 1.015
1408, 44, 0, 1, 1.015
1408, 0, 44, 0, 1.018
1408, 0, 44, 1, 1.014
1408, 44, 44, 0, 1.004
1408, 44, 44, 1, 0.994
1408, 2048, 0, 0, 0.988
1408, 2048, 0, 1, 0.988
1408, 2092, 0, 0, 1.015
1408, 2092, 0, 1, 1.015
1408, 2048, 44, 0, 0.955
1408, 2048, 44, 1, 0.955
1408, 2092, 44, 0, 1.0
1408, 2092, 44, 1, 0.994
1440, 0, 0, 0, 0.986
1440, 0, 0, 1, 0.986
1440, 45, 0, 0, 1.013
1440, 45, 0, 1, 1.013
1440, 0, 45, 0, 0.814
1440, 0, 45, 1, 0.814
1440, 45, 45, 0, 1.006
1440, 45, 45, 1, 1.006
1440, 2048, 0, 0, 0.986
1440, 2048, 0, 1, 0.986
1440, 2093, 0, 0, 1.013
1440, 2093, 0, 1, 1.013
1440, 2048, 45, 0, 0.814
1440, 2048, 45, 1, 0.814
1440, 2093, 45, 0, 0.966
1440, 2093, 45, 1, 0.966
1472, 0, 0, 0, 0.997
1472, 0, 0, 1, 0.994
1472, 46, 0, 0, 1.045
1472, 46, 0, 1, 1.045
1472, 0, 46, 0, 1.026
1472, 0, 46, 1, 1.026
1472, 46, 46, 0, 0.966
1472, 46, 46, 1, 0.966
1472, 2048, 0, 0, 1.0
1472, 2048, 0, 1, 0.996
1472, 2094, 0, 0, 1.045
1472, 2094, 0, 1, 1.045
1472, 2048, 46, 0, 0.939
1472, 2048, 46, 1, 0.939
1472, 2094, 46, 0, 0.966
1472, 2094, 46, 1, 0.966
1504, 0, 0, 0, 0.993
1504, 0, 0, 1, 0.993
1504, 47, 0, 0, 0.999
1504, 47, 0, 1, 0.999
1504, 0, 47, 0, 0.826
1504, 0, 47, 1, 0.826
1504, 47, 47, 0, 1.023
1504, 47, 47, 1, 1.023
1504, 2048, 0, 0, 0.993
1504, 2048, 0, 1, 0.993
1504, 2095, 0, 0, 0.999
1504, 2095, 0, 1, 0.999
1504, 2048, 47, 0, 0.826
1504, 2048, 47, 1, 0.826
1504, 2095, 47, 0, 0.993
1504, 2095, 47, 1, 0.993
1536, 0, 0, 0, 0.992
1536, 0, 0, 1, 0.991
1536, 48, 0, 0, 1.019
1536, 48, 0, 1, 1.019
1536, 0, 48, 0, 1.025
1536, 0, 48, 1, 1.024
1536, 48, 48, 0, 0.994
1536, 48, 48, 1, 0.994
1536, 2048, 0, 0, 0.994
1536, 2048, 0, 1, 0.994
1536, 2096, 0, 0, 1.019
1536, 2096, 0, 1, 1.019
1536, 2048, 48, 0, 1.025
1536, 2048, 48, 1, 1.025
1536, 2096, 48, 0, 0.994
1536, 2096, 48, 1, 0.994
1568, 0, 0, 0, 0.994
1568, 0, 0, 1, 0.994
1568, 49, 0, 0, 0.903
1568, 49, 0, 1, 0.903
1568, 0, 49, 0, 1.144
1568, 0, 49, 1, 1.144
1568, 49, 49, 0, 1.461
1568, 49, 49, 1, 1.461
1568, 2048, 0, 0, 0.993
1568, 2048, 0, 1, 0.993
1568, 2097, 0, 0, 0.903
1568, 2097, 0, 1, 0.903
1568, 2048, 49, 0, 1.09
1568, 2048, 49, 1, 1.09
1568, 2097, 49, 0, 1.46
1568, 2097, 49, 1, 1.46
1600, 0, 0, 0, 0.981
1600, 0, 0, 1, 0.981
1600, 50, 0, 0, 1.022
1600, 50, 0, 1, 1.022
1600, 0, 50, 0, 1.017
1600, 0, 50, 1, 1.017
1600, 50, 50, 0, 0.973
1600, 50, 50, 1, 0.973
1600, 2048, 0, 0, 0.981
1600, 2048, 0, 1, 0.981
1600, 2098, 0, 0, 1.022
1600, 2098, 0, 1, 1.022
1600, 2048, 50, 0, 0.961
1600, 2048, 50, 1, 0.961
1600, 2098, 50, 0, 0.973
1600, 2098, 50, 1, 0.973
1632, 0, 0, 0, 1.019
1632, 0, 0, 1, 1.019
1632, 51, 0, 0, 0.893
1632, 51, 0, 1, 0.893
1632, 0, 51, 0, 1.131
1632, 0, 51, 1, 1.131
1632, 51, 51, 0, 1.444
1632, 51, 51, 1, 1.444
1632, 2048, 0, 0, 1.019
1632, 2048, 0, 1, 1.019
1632, 2099, 0, 0, 0.893
1632, 2099, 0, 1, 0.893
1632, 2048, 51, 0, 1.079
1632, 2048, 51, 1, 1.079
1632, 2099, 51, 0, 1.449
1632, 2099, 51, 1, 1.449
1664, 0, 0, 0, 1.005
1664, 0, 0, 1, 1.004
1664, 52, 0, 0, 0.986
1664, 52, 0, 1, 0.986
1664, 0, 52, 0, 1.004
1664, 0, 52, 1, 1.004
1664, 52, 52, 0, 0.976
1664, 52, 52, 1, 0.976
1664, 2048, 0, 0, 1.006
1664, 2048, 0, 1, 1.006
1664, 2100, 0, 0, 0.993
1664, 2100, 0, 1, 0.993
1664, 2048, 52, 0, 0.946
1664, 2048, 52, 1, 0.946
1664, 2100, 52, 0, 0.976
1664, 2100, 52, 1, 0.976
1696, 0, 0, 0, 0.994
1696, 0, 0, 1, 0.992
1696, 53, 0, 0, 0.884
1696, 53, 0, 1, 0.884
1696, 0, 53, 0, 1.141
1696, 0, 53, 1, 1.141
1696, 53, 53, 0, 1.43
1696, 53, 53, 1, 1.43
1696, 2048, 0, 0, 0.994
1696, 2048, 0, 1, 0.994
1696, 2101, 0, 0, 0.884
1696, 2101, 0, 1, 0.884
1696, 2048, 53, 0, 1.088
1696, 2048, 53, 1, 1.088
1696, 2101, 53, 0, 1.429
1696, 2101, 53, 1, 1.429
1728, 0, 0, 0, 0.978
1728, 0, 0, 1, 0.978
1728, 54, 0, 0, 1.031
1728, 54, 0, 1, 1.033
1728, 0, 54, 0, 1.0
1728, 0, 54, 1, 1.0
1728, 54, 54, 0, 0.96
1728, 54, 54, 1, 0.96
1728, 2048, 0, 0, 0.976
1728, 2048, 0, 1, 0.976
1728, 2102, 0, 0, 1.033
1728, 2102, 0, 1, 1.033
1728, 2048, 54, 0, 0.947
1728, 2048, 54, 1, 0.947
1728, 2102, 54, 0, 0.96
1728, 2102, 54, 1, 0.96
1760, 0, 0, 0, 1.019
1760, 0, 0, 1, 1.021
1760, 55, 0, 0, 0.9
1760, 55, 0, 1, 0.9
1760, 0, 55, 0, 1.125
1760, 0, 55, 1, 1.125
1760, 55, 55, 0, 1.437
1760, 55, 55, 1, 1.436
1760, 2048, 0, 0, 1.016
1760, 2048, 0, 1, 1.015
1760, 2103, 0, 0, 0.9
1760, 2103, 0, 1, 0.9
1760, 2048, 55, 0, 1.073
1760, 2048, 55, 1, 1.074
1760, 2103, 55, 0, 1.44
1760, 2103, 55, 1, 1.44
1792, 0, 0, 0, 1.002
1792, 0, 0, 1, 1.002
1792, 56, 0, 0, 1.028
1792, 56, 0, 1, 1.028
1792, 0, 56, 0, 1.014
1792, 0, 56, 1, 1.015
1792, 56, 56, 0, 1.191
1792, 56, 56, 1, 1.191
1792, 2048, 0, 0, 1.003
1792, 2048, 0, 1, 1.003
1792, 2104, 0, 0, 1.028
1792, 2104, 0, 1, 1.028
1792, 2048, 56, 0, 0.963
1792, 2048, 56, 1, 0.963
1792, 2104, 56, 0, 1.191
1792, 2104, 56, 1, 1.191
1824, 0, 0, 0, 0.999
1824, 0, 0, 1, 1.0
1824, 57, 0, 0, 0.891
1824, 57, 0, 1, 0.891
1824, 0, 57, 0, 1.114
1824, 0, 57, 1, 1.114
1824, 57, 57, 0, 1.407
1824, 57, 57, 1, 1.407
1824, 2048, 0, 0, 1.001
1824, 2048, 0, 1, 1.001
1824, 2105, 0, 0, 0.891
1824, 2105, 0, 1, 0.891
1824, 2048, 57, 0, 1.064
1824, 2048, 57, 1, 1.064
1824, 2105, 57, 0, 1.407
1824, 2105, 57, 1, 1.407
1856, 0, 0, 0, 0.989
1856, 0, 0, 1, 0.987
1856, 58, 0, 0, 1.042
1856, 58, 0, 1, 1.042
1856, 0, 58, 0, 1.007
1856, 0, 58, 1, 1.007
1856, 58, 58, 0, 0.978
1856, 58, 58, 1, 0.972
1856, 2048, 0, 0, 0.992
1856, 2048, 0, 1, 0.992
1856, 2106, 0, 0, 1.042
1856, 2106, 0, 1, 1.042
1856, 2048, 58, 0, 0.954
1856, 2048, 58, 1, 0.954
1856, 2106, 58, 0, 0.979
1856, 2106, 58, 1, 0.972
1888, 0, 0, 0, 0.994
1888, 0, 0, 1, 0.994
1888, 59, 0, 0, 0.883
1888, 59, 0, 1, 0.883
1888, 0, 59, 0, 1.121
1888, 0, 59, 1, 1.123
1888, 59, 59, 0, 1.413
1888, 59, 59, 1, 1.413
1888, 2048, 0, 0, 0.985
1888, 2048, 0, 1, 0.994
1888, 2107, 0, 0, 0.883
1888, 2107, 0, 1, 0.883
1888, 2048, 59, 0, 1.076
1888, 2048, 59, 1, 1.076
1888, 2107, 59, 0, 1.413
1888, 2107, 59, 1, 1.413
1920, 0, 0, 0, 1.0
1920, 0, 0, 1, 0.999
1920, 60, 0, 0, 1.033
1920, 60, 0, 1, 1.033
1920, 0, 60, 0, 0.996
1920, 0, 60, 1, 0.997
1920, 60, 60, 0, 0.968
1920, 60, 60, 1, 0.968
1920, 2048, 0, 0, 1.0
1920, 2048, 0, 1, 1.0
1920, 2108, 0, 0, 1.034
1920, 2108, 0, 1, 1.034
1920, 2048, 60, 0, 0.949
1920, 2048, 60, 1, 0.949
1920, 2108, 60, 0, 0.968
1920, 2108, 60, 1, 0.968
1952, 0, 0, 0, 1.004
1952, 0, 0, 1, 1.004
1952, 61, 0, 0, 0.898
1952, 61, 0, 1, 0.898
1952, 0, 61, 0, 1.118
1952, 0, 61, 1, 1.118
1952, 61, 61, 0, 1.387
1952, 61, 61, 1, 1.387
1952, 2048, 0, 0, 1.004
1952, 2048, 0, 1, 1.004
1952, 2109, 0, 0, 0.898
1952, 2109, 0, 1, 0.898
1952, 2048, 61, 0, 1.071
1952, 2048, 61, 1, 1.071
1952, 2109, 61, 0, 1.387
1952, 2109, 61, 1, 1.387
1984, 0, 0, 0, 0.993
1984, 0, 0, 1, 0.993
1984, 62, 0, 0, 1.025
1984, 62, 0, 1, 1.025
1984, 0, 62, 0, 1.005
1984, 0, 62, 1, 1.007
1984, 62, 62, 0, 0.982
1984, 62, 62, 1, 0.982
1984, 2048, 0, 0, 0.993
1984, 2048, 0, 1, 0.993
1984, 2110, 0, 0, 1.025
1984, 2110, 0, 1, 1.025
1984, 2048, 62, 0, 0.96
1984, 2048, 62, 1, 0.96
1984, 2110, 62, 0, 0.982
1984, 2110, 62, 1, 0.982
2016, 0, 0, 0, 1.0
2016, 0, 0, 1, 0.999
2016, 63, 0, 0, 0.889
2016, 63, 0, 1, 0.89
2016, 0, 63, 0, 1.091
2016, 0, 63, 1, 1.092
2016, 63, 63, 0, 1.362
2016, 63, 63, 1, 1.363
2016, 2048, 0, 0, 1.0
2016, 2048, 0, 1, 1.0
2016, 2111, 0, 0, 0.965
2016, 2111, 0, 1, 0.965
2016, 2048, 63, 0, 1.049
2016, 2048, 63, 1, 1.049
2016, 2111, 63, 0, 1.405
2016, 2111, 63, 1, 1.405
2048, 32, 0, 0, 1.01
2048, 32, 0, 1, 1.01
2048, 0, 32, 0, 1.005
2048, 0, 32, 1, 1.005
2048, 32, 32, 0, 1.005
2048, 32, 32, 1, 1.005
2048, 0, 1, 0, 0.983
2048, 0, 1, 1, 0.984
2048, 1, 0, 0, 1.039
2048, 1, 0, 1, 1.039
2048, 32, 1, 0, 1.063
2048, 32, 1, 1, 1.063
2048, 1, 32, 0, 0.94
2048, 1, 32, 1, 0.94
2048, 2048, 1, 0, 0.981
2048, 2048, 1, 1, 0.981
2048, 2049, 0, 0, 0.904
2048, 2049, 0, 1, 0.904
2112, 0, 0, 0, 0.996
2112, 0, 0, 1, 0.995
2112, 1, 0, 0, 1.031
2112, 1, 0, 1, 1.031
2112, 33, 0, 0, 1.01
2112, 33, 0, 1, 1.01
2112, 0, 1, 0, 0.972
2112, 0, 1, 1, 0.972
2112, 0, 33, 0, 0.987
2112, 0, 33, 1, 0.987
2112, 1, 1, 0, 0.914
2112, 1, 1, 1, 0.914
2112, 33, 33, 0, 0.983
2112, 33, 33, 1, 0.983
2112, 2048, 0, 0, 0.994
2112, 2048, 0, 1, 0.99
2112, 2049, 0, 0, 1.031
2112, 2049, 0, 1, 1.031
2112, 2048, 1, 0, 0.955
2112, 2048, 1, 1, 0.955
2112, 2049, 1, 0, 0.906
2112, 2049, 1, 1, 0.906
2112, 33, 1, 0, 1.163
2112, 33, 1, 1, 1.164
2112, 1, 33, 0, 1.046
2112, 1, 33, 1, 1.046
2176, 0, 0, 0, 0.984
2176, 0, 0, 1, 0.985
2176, 2, 0, 0, 1.023
2176, 2, 0, 1, 1.023
2176, 34, 0, 0, 1.0
2176, 34, 0, 1, 1.0
2176, 0, 2, 0, 0.985
2176, 0, 2, 1, 0.985
2176, 0, 34, 0, 0.995
2176, 0, 34, 1, 0.982
2176, 2, 2, 0, 0.928
2176, 2, 2, 1, 0.928
2176, 34, 34, 0, 1.004
2176, 34, 34, 1, 1.004
2176, 2048, 0, 0, 0.985
2176, 2048, 0, 1, 0.986
2176, 2050, 0, 0, 1.023
2176, 2050, 0, 1, 1.023
2176, 2048, 2, 0, 0.802
2176, 2048, 2, 1, 0.802
2176, 2050, 2, 0, 0.894
2176, 2050, 2, 1, 0.894
2176, 2, 1, 0, 1.068
2176, 2, 1, 1, 1.068
2176, 1, 2, 0, 0.976
2176, 1, 2, 1, 0.976
2176, 34, 1, 0, 1.077
2176, 34, 1, 1, 1.077
2176, 1, 34, 0, 0.978
2176, 1, 34, 1, 0.978
2176, 2050, 1, 0, 1.061
2176, 2050, 1, 1, 1.061
2176, 2049, 2, 0, 0.971
2176, 2049, 2, 1, 0.971
2240, 0, 0, 0, 0.994
2240, 0, 0, 1, 0.994
2240, 3, 0, 0, 1.038
2240, 3, 0, 1, 1.039
2240, 35, 0, 0, 1.019
2240, 35, 0, 1, 1.019
2240, 0, 3, 0, 0.979
2240, 0, 3, 1, 0.98
2240, 0, 35, 0, 0.991
2240, 0, 35, 1, 0.991
2240, 3, 3, 0, 0.931
2240, 3, 3, 1, 0.931
2240, 35, 35, 0, 0.999
2240, 35, 35, 1, 0.999
2240, 2048, 0, 0, 0.995
2240, 2048, 0, 1, 0.995
2240, 2051, 0, 0, 1.039
2240, 2051, 0, 1, 1.039
2240, 2048, 3, 0, 0.799
2240, 2048, 3, 1, 0.799
2240, 2051, 3, 0, 0.889
2240, 2051, 3, 1, 0.889
2240, 3, 1, 0, 1.06
2240, 3, 1, 1, 1.06
2240, 1, 3, 0, 0.968
2240, 1, 3, 1, 0.968
2240, 35, 1, 0, 1.071
2240, 35, 1, 1, 1.071
2240, 1, 35, 0, 0.971
2240, 1, 35, 1, 0.971
2240, 2051, 1, 0, 1.057
2240, 2051, 1, 1, 1.057
2240, 2049, 3, 0, 0.966
2240, 2049, 3, 1, 0.966
2304, 0, 0, 0, 0.986
2304, 0, 0, 1, 0.986
2304, 4, 0, 0, 1.031
2304, 4, 0, 1, 1.032
2304, 36, 0, 0, 1.011
2304, 36, 0, 1, 1.011
2304, 0, 4, 0, 0.968
2304, 0, 4, 1, 0.969
2304, 0, 36, 0, 0.988
2304, 0, 36, 1, 0.988
2304, 4, 4, 0, 0.93
2304, 4, 4, 1, 0.931
2304, 36, 36, 0, 0.992
2304, 36, 36, 1, 0.992
2304, 2048, 0, 0, 0.988
2304, 2048, 0, 1, 0.988
2304, 2052, 0, 0, 1.032
2304, 2052, 0, 1, 1.032
2304, 2048, 4, 0, 0.793
2304, 2048, 4, 1, 0.793
2304, 2052, 4, 0, 0.884
2304, 2052, 4, 1, 0.884
2304, 4, 1, 0, 0.989
2304, 4, 1, 1, 0.989
2304, 1, 4, 0, 0.897
2304, 1, 4, 1, 0.898
2304, 36, 1, 0, 1.057
2304, 36, 1, 1, 1.057
2304, 1, 36, 0, 0.966
2304, 1, 36, 1, 0.966
2304, 2052, 1, 0, 1.052
2304, 2052, 1, 1, 1.052
2304, 2049, 4, 0, 0.955
2304, 2049, 4, 1, 0.955
2368, 0, 0, 0, 1.0
2368, 0, 0, 1, 1.001
2368, 5, 0, 0, 1.024
2368, 5, 0, 1, 1.025
2368, 37, 0, 0, 1.0
2368, 37, 0, 1, 1.0
2368, 0, 5, 0, 0.98
2368, 0, 5, 1, 0.981
2368, 0, 37, 0, 0.983
2368, 0, 37, 1, 0.98
2368, 5, 5, 0, 0.944
2368, 5, 5, 1, 0.944
2368, 37, 37, 0, 1.003
2368, 37, 37, 1, 1.003
2368, 2048, 0, 0, 1.002
2368, 2048, 0, 1, 1.002
2368, 2053, 0, 0, 1.025
2368, 2053, 0, 1, 1.025
2368, 2048, 5, 0, 0.801
2368, 2048, 5, 1, 0.801
2368, 2053, 5, 0, 0.907
2368, 2053, 5, 1, 0.907
2368, 5, 1, 0, 1.071
2368, 5, 1, 1, 1.071
2368, 1, 5, 0, 0.973
2368, 1, 5, 1, 0.973
2368, 37, 1, 0, 1.07
2368, 37, 1, 1, 1.07
2368, 1, 37, 0, 0.974
2368, 1, 37, 1, 0.974
2368, 2053, 1, 0, 1.065
2368, 2053, 1, 1, 1.065
2368, 2049, 5, 0, 0.967
2368, 2049, 5, 1, 0.967
2432, 0, 0, 0, 0.965
2432, 0, 0, 1, 1.0
2432, 6, 0, 0, 1.038
2432, 6, 0, 1, 1.039
2432, 38, 0, 0, 1.021
2432, 38, 0, 1, 1.021
2432, 0, 6, 0, 0.974
2432, 0, 6, 1, 0.976
2432, 0, 38, 0, 0.986
2432, 0, 38, 1, 0.986
2432, 6, 6, 0, 0.926
2432, 6, 6, 1, 0.926
2432, 38, 38, 0, 1.0
2432, 38, 38, 1, 1.0
2432, 2048, 0, 0, 1.004
2432, 2048, 0, 1, 1.004
2432, 2054, 0, 0, 1.039
2432, 2054, 0, 1, 1.039
2432, 2048, 6, 0, 0.797
2432, 2048, 6, 1, 0.797
2432, 2054, 6, 0, 0.898
2432, 2054, 6, 1, 0.898
2432, 6, 1, 0, 1.063
2432, 6, 1, 1, 1.063
2432, 1, 6, 0, 0.965
2432, 1, 6, 1, 0.965
2432, 38, 1, 0, 1.068
2432, 38, 1, 1, 1.068
2432, 1, 38, 0, 0.968
2432, 1, 38, 1, 0.968
2432, 2054, 1, 0, 1.06
2432, 2054, 1, 1, 1.06
2432, 2049, 6, 0, 0.963
2432, 2049, 6, 1, 0.963
2496, 0, 0, 0, 1.013
2496, 0, 0, 1, 1.013
2496, 7, 0, 0, 1.032
2496, 7, 0, 1, 1.032
2496, 39, 0, 0, 1.013
2496, 39, 0, 1, 1.013
2496, 0, 7, 0, 0.965
2496, 0, 7, 1, 0.965
2496, 0, 39, 0, 0.979
2496, 0, 39, 1, 0.979
2496, 7, 7, 0, 0.925
2496, 7, 7, 1, 0.925
2496, 39, 39, 0, 0.989
2496, 39, 39, 1, 0.989
2496, 2048, 0, 0, 1.013
2496, 2048, 0, 1, 1.013
2496, 2055, 0, 0, 1.032
2496, 2055, 0, 1, 1.032
2496, 2048, 7, 0, 0.792
2496, 2048, 7, 1, 0.792
2496, 2055, 7, 0, 0.93
2496, 2055, 7, 1, 0.93
2496, 7, 1, 0, 0.984
2496, 7, 1, 1, 0.984
2496, 1, 7, 0, 0.894
2496, 1, 7, 1, 0.895
2496, 39, 1, 0, 1.054
2496, 39, 1, 1, 1.054
2496, 1, 39, 0, 0.963
2496, 1, 39, 1, 0.963
2496, 2055, 1, 0, 1.049
2496, 2055, 1, 1, 1.049
2496, 2049, 7, 0, 0.953
2496, 2049, 7, 1, 0.953
2560, 0, 0, 0, 0.991
2560, 0, 0, 1, 0.991
2560, 8, 0, 0, 1.031
2560, 8, 0, 1, 1.032
2560, 40, 0, 0, 1.029
2560, 40, 0, 1, 1.029
2560, 0, 8, 0, 0.992
2560, 0, 8, 1, 0.992
2560, 0, 40, 0, 0.975
2560, 0, 40, 1, 0.984
2560, 8, 8, 0, 0.942
2560, 8, 8, 1, 0.943
2560, 40, 40, 0, 1.139
2560, 40, 40, 1, 1.139
2560, 2048, 0, 0, 0.993
2560, 2048, 0, 1, 0.993
2560, 2056, 0, 0, 1.032
2560, 2056, 0, 1, 1.032
2560, 2048, 8, 0, 0.812
2560, 2048, 8, 1, 0.812
2560, 2056, 8, 0, 0.912
2560, 2056, 8, 1, 0.912
2560, 8, 1, 0, 1.068
2560, 8, 1, 1, 1.069
2560, 1, 8, 0, 0.974
2560, 1, 8, 1, 0.974
2560, 40, 1, 0, 1.068
2560, 40, 1, 1, 1.068
2560, 1, 40, 0, 0.996
2560, 1, 40, 1, 0.996
2560, 2056, 1, 0, 1.063
2560, 2056, 1, 1, 1.063
2560, 2049, 8, 0, 0.969
2560, 2049, 8, 1, 0.969
2624, 0, 0, 0, 0.995
2624, 0, 0, 1, 0.994
2624, 9, 0, 0, 1.015
2624, 9, 0, 1, 1.018
2624, 41, 0, 0, 1.044
2624, 41, 0, 1, 1.044
2624, 0, 9, 0, 0.988
2624, 0, 9, 1, 0.99
2624, 0, 41, 0, 0.989
2624, 0, 41, 1, 0.99
2624, 9, 9, 0, 0.943
2624, 9, 9, 1, 0.943
2624, 41, 41, 0, 0.993
2624, 41, 41, 1, 0.993
2624, 2048, 0, 0, 0.998
2624, 2048, 0, 1, 0.998
2624, 2057, 0, 0, 1.018
2624, 2057, 0, 1, 1.018
2624, 2048, 9, 0, 0.81
2624, 2048, 9, 1, 0.81
2624, 2057, 9, 0, 0.907
2624, 2057, 9, 1, 0.907
2624, 9, 1, 0, 1.09
2624, 9, 1, 1, 1.09
2624, 1, 9, 0, 0.967
2624, 1, 9, 1, 0.967
2624, 41, 1, 0, 1.084
2624, 41, 1, 1, 1.085
2624, 1, 41, 0, 0.958
2624, 1, 41, 1, 0.957
2624, 2057, 1, 0, 1.087
2624, 2057, 1, 1, 1.087
2624, 2049, 9, 0, 0.965
2624, 2049, 9, 1, 0.965
2688, 0, 0, 0, 0.995
2688, 0, 0, 1, 0.995
2688, 10, 0, 0, 1.01
2688, 10, 0, 1, 1.012
2688, 42, 0, 0, 1.036
2688, 42, 0, 1, 1.036
2688, 0, 10, 0, 0.978
2688, 0, 10, 1, 0.979
2688, 0, 42, 0, 0.977
2688, 0, 42, 1, 0.978
2688, 10, 10, 0, 0.942
2688, 10, 10, 1, 0.942
2688, 42, 42, 0, 0.989
2688, 42, 42, 1, 0.989
2688, 2048, 0, 0, 0.995
2688, 2048, 0, 1, 0.995
2688, 2058, 0, 0, 1.012
2688, 2058, 0, 1, 1.012
2688, 2048, 10, 0, 0.804
2688, 2048, 10, 1, 0.804
2688, 2058, 10, 0, 0.905
2688, 2058, 10, 1, 0.905
2688, 10, 1, 0, 0.986
2688, 10, 1, 1, 0.987
2688, 1, 10, 0, 0.893
2688, 1, 10, 1, 0.894
2688, 42, 1, 0, 1.054
2688, 42, 1, 1, 1.054
2688, 1, 42, 0, 0.958
2688, 1, 42, 1, 0.958
2688, 2058, 1, 0, 1.052
2688, 2058, 1, 1, 1.052
2688, 2049, 10, 0, 0.954
2688, 2049, 10, 1, 0.954
2752, 0, 0, 0, 1.0
2752, 0, 0, 1, 0.992
2752, 11, 0, 0, 0.954
2752, 11, 0, 1, 0.954
2752, 43, 0, 0, 0.979
2752, 43, 0, 1, 0.979
2752, 0, 11, 0, 0.939
2752, 0, 11, 1, 0.939
2752, 0, 43, 0, 0.931
2752, 0, 43, 1, 0.932
2752, 11, 11, 0, 0.949
2752, 11, 11, 1, 0.949
2752, 43, 43, 0, 1.007
2752, 43, 43, 1, 1.007
2752, 2048, 0, 0, 0.993
2752, 2048, 0, 1, 0.993
2752, 2059, 0, 0, 0.954
2752, 2059, 0, 1, 0.954
2752, 2048, 11, 0, 0.77
2752, 2048, 11, 1, 0.77
2752, 2059, 11, 0, 0.916
2752, 2059, 11, 1, 0.916
2752, 11, 1, 0, 0.994
2752, 11, 1, 1, 0.994
2752, 1, 11, 0, 0.928
2752, 1, 11, 1, 0.928
2752, 43, 1, 0, 1.022
2752, 43, 1, 1, 1.022
2752, 1, 43, 0, 0.92
2752, 1, 43, 1, 0.92
2752, 2059, 1, 0, 0.989
2752, 2059, 1, 1, 0.989
2752, 2049, 11, 0, 0.923
2752, 2049, 11, 1, 0.923
2816, 0, 0, 0, 1.003
2816, 0, 0, 1, 1.003
2816, 12, 0, 0, 0.897
2816, 12, 0, 1, 0.894
2816, 44, 0, 0, 0.914
2816, 44, 0, 1, 0.914
2816, 0, 12, 0, 0.876
2816, 0, 12, 1, 0.874
2816, 0, 44, 0, 0.871
2816, 0, 44, 1, 0.87
2816, 12, 12, 0, 0.948
2816, 12, 12, 1, 0.948
2816, 44, 44, 0, 1.009
2816, 44, 44, 1, 1.009
2816, 2048, 0, 0, 1.005
2816, 2048, 0, 1, 1.005
2816, 2060, 0, 0, 0.894
2816, 2060, 0, 1, 0.894
2816, 2048, 12, 0, 0.714
2816, 2048, 12, 1, 0.713
2816, 2060, 12, 0, 0.915
2816, 2060, 12, 1, 0.915
2816, 12, 1, 0, 0.917
2816, 12, 1, 1, 0.917
2816, 1, 12, 0, 0.858
2816, 1, 12, 1, 0.857
2816, 44, 1, 0, 0.944
2816, 44, 1, 1, 0.943
2816, 1, 44, 0, 0.856
2816, 1, 44, 1, 0.856
2816, 2060, 1, 0, 0.914
2816, 2060, 1, 1, 0.914
2816, 2049, 12, 0, 0.855
2816, 2049, 12, 1, 0.855
2880, 0, 0, 0, 0.989
2880, 0, 0, 1, 0.989
2880, 13, 0, 0, 0.967
2880, 13, 0, 1, 0.967
2880, 45, 0, 0, 0.987
2880, 45, 0, 1, 0.987
2880, 0, 13, 0, 0.925
2880, 0, 13, 1, 0.925
2880, 0, 45, 0, 0.927
2880, 0, 45, 1, 0.927
2880, 13, 13, 0, 0.944
2880, 13, 13, 1, 0.944
2880, 45, 45, 0, 1.003
2880, 45, 45, 1, 1.003
2880, 2048, 0, 0, 0.989
2880, 2048, 0, 1, 0.989
2880, 2061, 0, 0, 0.967
2880, 2061, 0, 1, 0.967
2880, 2048, 13, 0, 0.76
2880, 2048, 13, 1, 0.76
2880, 2061, 13, 0, 0.91
2880, 2061, 13, 1, 0.91
2880, 13, 1, 0, 0.922
2880, 13, 1, 1, 0.922
2880, 1, 13, 0, 0.859
2880, 1, 13, 1, 0.859
2880, 45, 1, 0, 1.013
2880, 45, 1, 1, 1.013
2880, 1, 45, 0, 0.92
2880, 1, 45, 1, 0.92
2880, 2061, 1, 0, 0.984
2880, 2061, 1, 1, 0.984
2880, 2049, 13, 0, 0.918
2880, 2049, 13, 1, 0.918
2944, 0, 0, 0, 1.014
2944, 0, 0, 1, 1.014
2944, 14, 0, 0, 0.956
2944, 14, 0, 1, 0.955
2944, 46, 0, 0, 0.979
2944, 46, 0, 1, 0.979
2944, 0, 14, 0, 0.937
2944, 0, 14, 1, 0.937
2944, 0, 46, 0, 0.93
2944, 0, 46, 1, 0.93
2944, 14, 14, 0, 0.953
2944, 14, 14, 1, 0.953
2944, 46, 46, 0, 1.009
2944, 46, 46, 1, 1.009
2944, 2048, 0, 0, 1.015
2944, 2048, 0, 1, 1.015
2944, 2062, 0, 0, 0.955
2944, 2062, 0, 1, 0.955
2944, 2048, 14, 0, 0.769
2944, 2048, 14, 1, 0.769
2944, 2062, 14, 0, 0.923
2944, 2062, 14, 1, 0.923
2944, 14, 1, 0, 0.994
2944, 14, 1, 1, 0.994
2944, 1, 14, 0, 0.927
2944, 1, 14, 1, 0.927
2944, 46, 1, 0, 1.021
2944, 46, 1, 1, 1.021
2944, 1, 46, 0, 0.923
2944, 1, 46, 1, 0.923
2944, 2062, 1, 0, 0.988
2944, 2062, 1, 1, 0.988
2944, 2049, 14, 0, 0.922
2944, 2049, 14, 1, 0.922
3008, 0, 0, 0, 0.994
3008, 0, 0, 1, 0.994
3008, 15, 0, 0, 0.941
3008, 15, 0, 1, 0.941
3008, 47, 0, 0, 0.996
3008, 47, 0, 1, 0.996
3008, 0, 15, 0, 0.929
3008, 0, 15, 1, 0.933
3008, 0, 47, 0, 0.933
3008, 0, 47, 1, 0.933
3008, 15, 15, 0, 0.952
3008, 15, 15, 1, 0.949
3008, 47, 47, 0, 1.003
3008, 47, 47, 1, 1.003
3008, 2048, 0, 0, 0.998
3008, 2048, 0, 1, 0.998
3008, 2063, 0, 0, 0.941
3008, 2063, 0, 1, 0.941
3008, 2048, 15, 0, 0.766
3008, 2048, 15, 1, 0.766
3008, 2063, 15, 0, 0.916
3008, 2063, 15, 1, 0.916
3008, 15, 1, 0, 0.985
3008, 15, 1, 1, 0.985
3008, 1, 15, 0, 0.916
3008, 1, 15, 1, 0.916
3008, 47, 1, 0, 1.014
3008, 47, 1, 1, 1.014
3008, 1, 47, 0, 0.902
3008, 1, 47, 1, 0.902
3008, 2063, 1, 0, 0.981
3008, 2063, 1, 1, 0.981
3008, 2049, 15, 0, 0.912
3008, 2049, 15, 1, 0.913
3072, 0, 0, 0, 1.016
3072, 0, 0, 1, 1.015
3072, 16, 0, 0, 1.045
3072, 16, 0, 1, 1.045
3072, 48, 0, 0, 1.045
3072, 48, 0, 1, 1.045
3072, 0, 16, 0, 1.049
3072, 0, 16, 1, 1.049
3072, 0, 48, 0, 1.049
3072, 0, 48, 1, 1.049
3072, 16, 16, 0, 1.016
3072, 16, 16, 1, 1.016
3072, 48, 48, 0, 1.016
3072, 48, 48, 1, 1.016
3072, 2048, 0, 0, 1.016
3072, 2048, 0, 1, 1.016
3072, 2064, 0, 0, 1.045
3072, 2064, 0, 1, 1.045
3072, 2048, 16, 0, 1.049
3072, 2048, 16, 1, 1.049
3072, 2064, 16, 0, 1.016
3072, 2064, 16, 1, 1.016
3072, 16, 1, 0, 0.815
3072, 16, 1, 1, 0.815
3072, 1, 16, 0, 0.872
3072, 1, 16, 1, 0.872
3072, 48, 1, 0, 1.017
3072, 48, 1, 1, 1.017
3072, 1, 48, 0, 0.872
3072, 1, 48, 1, 0.872
3072, 2064, 1, 0, 0.815
3072, 2064, 1, 1, 0.815
3072, 2049, 16, 0, 0.872
3072, 2049, 16, 1, 0.872
3136, 0, 0, 0, 0.995
3136, 0, 0, 1, 0.995
3136, 17, 0, 0, 0.949
3136, 17, 0, 1, 0.949
3136, 49, 0, 0, 0.987
3136, 49, 0, 1, 0.987
3136, 0, 17, 0, 0.919
3136, 0, 17, 1, 0.917
3136, 0, 49, 0, 0.931
3136, 0, 49, 1, 0.931
3136, 17, 17, 0, 1.122
3136, 17, 17, 1, 1.119
3136, 49, 49, 0, 0.987
3136, 49, 49, 1, 0.987
3136, 2048, 0, 0, 0.997
3136, 2048, 0, 1, 0.997
3136, 2065, 0, 0, 0.949
3136, 2065, 0, 1, 0.949
3136, 2048, 17, 0, 0.896
3136, 2048, 17, 1, 0.896
3136, 2065, 17, 0, 1.122
3136, 2065, 17, 1, 1.119
3136, 17, 1, 0, 1.184
3136, 17, 1, 1, 1.184
3136, 1, 17, 0, 1.124
3136, 1, 17, 1, 1.125
3136, 49, 1, 0, 1.11
3136, 49, 1, 1, 1.108
3136, 1, 49, 0, 1.044
3136, 1, 49, 1, 1.044
3136, 2065, 1, 0, 1.147
3136, 2065, 1, 1, 1.147
3136, 2049, 17, 0, 1.102
3136, 2049, 17, 1, 1.1
3200, 0, 0, 0, 1.006
3200, 0, 0, 1, 1.006
3200, 18, 0, 0, 0.978
3200, 18, 0, 1, 0.978
3200, 50, 0, 0, 0.998
3200, 50, 0, 1, 0.998
3200, 0, 18, 0, 0.932
3200, 0, 18, 1, 0.932
3200, 0, 50, 0, 0.93
3200, 0, 50, 1, 0.93
3200, 18, 18, 0, 1.11
3200, 18, 18, 1, 1.11
3200, 50, 50, 0, 0.994
3200, 50, 50, 1, 0.994
3200, 2048, 0, 0, 1.007
3200, 2048, 0, 1, 1.007
3200, 2066, 0, 0, 0.978
3200, 2066, 0, 1, 0.978
3200, 2048, 18, 0, 0.894
3200, 2048, 18, 1, 0.894
3200, 2066, 18, 0, 1.11
3200, 2066, 18, 1, 1.11
3200, 18, 1, 0, 1.002
3200, 18, 1, 1, 1.002
3200, 1, 18, 0, 0.917
3200, 1, 18, 1, 0.917
3200, 50, 1, 0, 0.963
3200, 50, 1, 1, 0.964
3200, 1, 50, 0, 0.888
3200, 1, 50, 1, 0.888
3200, 2066, 1, 0, 1.002
3200, 2066, 1, 1, 1.002
3200, 2049, 18, 0, 0.914
3200, 2049, 18, 1, 0.914
3264, 0, 0, 0, 0.994
3264, 0, 0, 1, 0.994
3264, 19, 0, 0, 0.959
3264, 19, 0, 1, 0.959
3264, 51, 0, 0, 0.994
3264, 51, 0, 1, 0.994
3264, 0, 19, 0, 0.927
3264, 0, 19, 1, 0.927
3264, 0, 51, 0, 0.927
3264, 0, 51, 1, 0.927
3264, 19, 19, 0, 1.1
3264, 19, 19, 1, 1.1
3264, 51, 51, 0, 0.982
3264, 51, 51, 1, 0.982
3264, 2048, 0, 0, 0.994
3264, 2048, 0, 1, 0.994
3264, 2067, 0, 0, 0.959
3264, 2067, 0, 1, 0.959
3264, 2048, 19, 0, 0.891
3264, 2048, 19, 1, 0.891
3264, 2067, 19, 0, 1.099
3264, 2067, 19, 1, 1.099
3264, 19, 1, 0, 0.977
3264, 19, 1, 1, 0.976
3264, 1, 19, 0, 0.921
3264, 1, 19, 1, 0.921
3264, 51, 1, 0, 0.959
3264, 51, 1, 1, 0.959
3264, 1, 51, 0, 0.886
3264, 1, 51, 1, 0.886
3264, 2067, 1, 0, 0.976
3264, 2067, 1, 1, 0.976
3264, 2049, 19, 0, 0.917
3264, 2049, 19, 1, 0.917
3328, 0, 0, 0, 0.996
3328, 0, 0, 1, 0.992
3328, 20, 0, 0, 0.955
3328, 20, 0, 1, 0.955
3328, 52, 0, 0, 0.99
3328, 52, 0, 1, 0.99
3328, 0, 20, 0, 0.926
3328, 0, 20, 1, 0.923
3328, 0, 52, 0, 0.933
3328, 0, 52, 1, 0.933
3328, 20, 20, 0, 1.11
3328, 20, 20, 1, 1.11
3328, 52, 52, 0, 0.988
3328, 52, 52, 1, 0.988
3328, 2048, 0, 0, 0.993
3328, 2048, 0, 1, 0.993
3328, 2068, 0, 0, 0.955
3328, 2068, 0, 1, 0.955
3328, 2048, 20, 0, 0.9
3328, 2048, 20, 1, 0.9
3328, 2068, 20, 0, 1.109
3328, 2068, 20, 1, 1.109
3328, 20, 1, 0, 0.99
3328, 20, 1, 1, 0.99
3328, 1, 20, 0, 0.922
3328, 1, 20, 1, 0.922
3328, 52, 1, 0, 0.972
3328, 52, 1, 1, 0.972
3328, 1, 52, 0, 0.901
3328, 1, 52, 1, 0.901
3328, 2068, 1, 0, 0.99
3328, 2068, 1, 1, 0.99
3328, 2049, 20, 0, 0.918
3328, 2049, 20, 1, 0.918
3392, 0, 0, 0, 0.998
3392, 0, 0, 1, 1.0
3392, 21, 0, 0, 0.964
3392, 21, 0, 1, 0.964
3392, 53, 0, 0, 0.998
3392, 53, 0, 1, 0.998
3392, 0, 21, 0, 0.932
3392, 0, 21, 1, 0.932
3392, 0, 53, 0, 0.93
3392, 0, 53, 1, 0.93
3392, 21, 21, 0, 1.113
3392, 21, 21, 1, 1.113
3392, 53, 53, 0, 0.983
3392, 53, 53, 1, 0.983
3392, 2048, 0, 0, 1.0
3392, 2048, 0, 1, 1.0
3392, 2069, 0, 0, 0.964
3392, 2069, 0, 1, 0.964
3392, 2048, 21, 0, 0.895
3392, 2048, 21, 1, 0.896
3392, 2069, 21, 0, 1.113
3392, 2069, 21, 1, 1.113
3392, 21, 1, 0, 0.994
3392, 21, 1, 1, 0.994
3392, 1, 21, 0, 0.923
3392, 1, 21, 1, 0.923
3392, 53, 1, 0, 0.972
3392, 53, 1, 1, 0.972
3392, 1, 53, 0, 0.891
3392, 1, 53, 1, 0.891
3392, 2069, 1, 0, 0.994
3392, 2069, 1, 1, 0.994
3392, 2049, 21, 0, 0.922
3392, 2049, 21, 1, 0.922
3456, 0, 0, 0, 0.995
3456, 0, 0, 1, 0.995
3456, 22, 0, 0, 0.965
3456, 22, 0, 1, 0.965
3456, 54, 0, 0, 0.996
3456, 54, 0, 1, 0.996
3456, 0, 22, 0, 0.927
3456, 0, 22, 1, 0.927
3456, 0, 54, 0, 0.927
3456, 0, 54, 1, 0.927
3456, 22, 22, 0, 1.107
3456, 22, 22, 1, 1.107
3456, 54, 54, 0, 0.98
3456, 54, 54, 1, 0.98
3456, 2048, 0, 0, 0.995
3456, 2048, 0, 1, 0.995
3456, 2070, 0, 0, 0.965
3456, 2070, 0, 1, 0.965
3456, 2048, 22, 0, 0.893
3456, 2048, 22, 1, 0.893
3456, 2070, 22, 0, 1.107
3456, 2070, 22, 1, 1.107
3456, 22, 1, 0, 0.988
3456, 22, 1, 1, 0.988
3456, 1, 22, 0, 0.921
3456, 1, 22, 1, 0.921
3456, 54, 1, 0, 0.963
3456, 54, 1, 1, 0.963
3456, 1, 54, 0, 0.887
3456, 1, 54, 1, 0.887
3456, 2070, 1, 0, 0.988
3456, 2070, 1, 1, 0.988
3456, 2049, 22, 0, 0.917
3456, 2049, 22, 1, 0.917
3520, 0, 0, 0, 1.016
3520, 0, 0, 1, 1.016
3520, 23, 0, 0, 0.957
3520, 23, 0, 1, 0.957
3520, 55, 0, 0, 0.991
3520, 55, 0, 1, 0.991
3520, 0, 23, 0, 0.919
3520, 0, 23, 1, 0.924
3520, 0, 55, 0, 0.934
3520, 0, 55, 1, 0.934
3520, 23, 23, 0, 1.111
3520, 23, 23, 1, 1.111
3520, 55, 55, 0, 0.994
3520, 55, 55, 1, 0.994
3520, 2048, 0, 0, 1.016
3520, 2048, 0, 1, 1.016
3520, 2071, 0, 0, 0.957
3520, 2071, 0, 1, 0.957
3520, 2048, 23, 0, 0.903
3520, 2048, 23, 1, 0.903
3520, 2071, 23, 0, 1.111
3520, 2071, 23, 1, 1.111
3520, 23, 1, 0, 0.997
3520, 23, 1, 1, 0.997
3520, 1, 23, 0, 0.921
3520, 1, 23, 1, 0.921
3520, 55, 1, 0, 0.976
3520, 55, 1, 1, 0.976
3520, 1, 55, 0, 0.902
3520, 1, 55, 1, 0.902
3520, 2071, 1, 0, 0.997
3520, 2071, 1, 1, 0.997
3520, 2049, 23, 0, 0.918
3520, 2049, 23, 1, 0.918
3584, 0, 0, 0, 1.004
3584, 0, 0, 1, 1.004
3584, 24, 0, 0, 0.985
3584, 24, 0, 1, 0.979
3584, 56, 0, 0, 1.006
3584, 56, 0, 1, 1.006
3584, 0, 24, 0, 0.931
3584, 0, 24, 1, 0.931
3584, 0, 56, 0, 0.93
3584, 0, 56, 1, 0.93
3584, 24, 24, 0, 1.111
3584, 24, 24, 1, 1.11
3584, 56, 56, 0, 1.101
3584, 56, 56, 1, 1.1
3584, 2048, 0, 0, 1.005
3584, 2048, 0, 1, 1.005
3584, 2072, 0, 0, 0.98
3584, 2072, 0, 1, 0.978
3584, 2048, 24, 0, 0.896
3584, 2048, 24, 1, 0.897
3584, 2072, 24, 0, 1.111
3584, 2072, 24, 1, 1.111
3584, 24, 1, 0, 1.004
3584, 24, 1, 1, 1.004
3584, 1, 24, 0, 0.921
3584, 1, 24, 1, 0.921
3584, 56, 1, 0, 0.971
3584, 56, 1, 1, 0.97
3584, 1, 56, 0, 0.89
3584, 1, 56, 1, 0.89
3584, 2072, 1, 0, 1.004
3584, 2072, 1, 1, 1.004
3584, 2049, 24, 0, 0.918
3584, 2049, 24, 1, 0.918
3648, 0, 0, 0, 1.012
3648, 0, 0, 1, 1.012
3648, 25, 0, 0, 0.96
3648, 25, 0, 1, 0.96
3648, 57, 0, 0, 0.988
3648, 57, 0, 1, 0.988
3648, 0, 25, 0, 0.927
3648, 0, 25, 1, 0.927
3648, 0, 57, 0, 0.927
3648, 0, 57, 1, 0.927
3648, 25, 25, 0, 1.101
3648, 25, 25, 1, 1.101
3648, 57, 57, 0, 0.986
3648, 57, 57, 1, 0.986
3648, 2048, 0, 0, 1.012
3648, 2048, 0, 1, 1.012
3648, 2073, 0, 0, 0.96
3648, 2073, 0, 1, 0.959
3648, 2048, 25, 0, 0.894
3648, 2048, 25, 1, 0.895
3648, 2073, 25, 0, 1.103
3648, 2073, 25, 1, 1.103
3648, 25, 1, 0, 1.024
3648, 25, 1, 1, 1.024
3648, 1, 25, 0, 0.911
3648, 1, 25, 1, 0.912
3648, 57, 1, 0, 0.973
3648, 57, 1, 1, 0.974
3648, 1, 57, 0, 0.888
3648, 1, 57, 1, 0.888
3648, 2073, 1, 0, 1.024
3648, 2073, 1, 1, 1.024
3648, 2049, 25, 0, 0.907
3648, 2049, 25, 1, 0.907
3712, 0, 0, 0, 0.996
3712, 0, 0, 1, 0.996
3712, 26, 0, 0, 0.96
3712, 26, 0, 1, 0.96
3712, 58, 0, 0, 0.995
3712, 58, 0, 1, 0.995
3712, 0, 26, 0, 0.919
3712, 0, 26, 1, 0.918
3712, 0, 58, 0, 0.93
3712, 0, 58, 1, 0.93
3712, 26, 26, 0, 1.103
3712, 26, 26, 1, 1.102
3712, 58, 58, 0, 0.989
3712, 58, 58, 1, 0.989
3712, 2048, 0, 0, 0.997
3712, 2048, 0, 1, 0.997
3712, 2074, 0, 0, 0.959
3712, 2074, 0, 1, 0.959
3712, 2048, 26, 0, 0.901
3712, 2048, 26, 1, 0.901
3712, 2074, 26, 0, 1.104
3712, 2074, 26, 1, 1.102
3712, 26, 1, 0, 1.001
3712, 26, 1, 1, 1.001
3712, 1, 26, 0, 0.922
3712, 1, 26, 1, 0.922
3712, 58, 1, 0, 0.974
3712, 58, 1, 1, 0.974
3712, 1, 58, 0, 0.903
3712, 1, 58, 1, 0.903
3712, 2074, 1, 0, 1.001
3712, 2074, 1, 1, 1.001
3712, 2049, 26, 0, 0.919
3712, 2049, 26, 1, 0.919
3776, 0, 0, 0, 1.003
3776, 0, 0, 1, 1.003
3776, 27, 0, 0, 0.964
3776, 27, 0, 1, 0.964
3776, 59, 0, 0, 1.004
3776, 59, 0, 1, 1.004
3776, 0, 27, 0, 0.931
3776, 0, 27, 1, 0.931
3776, 0, 59, 0, 0.929
3776, 0, 59, 1, 0.93
3776, 27, 27, 0, 1.097
3776, 27, 27, 1, 1.097
3776, 59, 59, 0, 0.992
3776, 59, 59, 1, 0.992
3776, 2048, 0, 0, 1.003
3776, 2048, 0, 1, 1.003
3776, 2075, 0, 0, 0.963
3776, 2075, 0, 1, 0.964
3776, 2048, 27, 0, 0.898
3776, 2048, 27, 1, 0.898
3776, 2075, 27, 0, 1.097
3776, 2075, 27, 1, 1.097
3776, 27, 1, 0, 0.998
3776, 27, 1, 1, 0.998
3776, 1, 27, 0, 0.925
3776, 1, 27, 1, 0.925
3776, 59, 1, 0, 0.979
3776, 59, 1, 1, 0.979
3776, 1, 59, 0, 0.894
3776, 1, 59, 1, 0.894
3776, 2075, 1, 0, 0.998
3776, 2075, 1, 1, 0.999
3776, 2049, 27, 0, 0.923
3776, 2049, 27, 1, 0.923
3840, 0, 0, 0, 0.997
3840, 0, 0, 1, 0.997
3840, 28, 0, 0, 0.968
3840, 28, 0, 1, 0.968
3840, 60, 0, 0, 1.001
3840, 60, 0, 1, 1.001
3840, 0, 28, 0, 0.926
3840, 0, 28, 1, 0.927
3840, 0, 60, 0, 0.927
3840, 0, 60, 1, 0.927
3840, 28, 28, 0, 1.094
3840, 28, 28, 1, 1.094
3840, 60, 60, 0, 0.982
3840, 60, 60, 1, 0.982
3840, 2048, 0, 0, 0.998
3840, 2048, 0, 1, 0.998
3840, 2076, 0, 0, 0.968
3840, 2076, 0, 1, 0.968
3840, 2048, 28, 0, 0.896
3840, 2048, 28, 1, 0.896
3840, 2076, 28, 0, 1.094
3840, 2076, 28, 1, 1.094
3840, 28, 1, 0, 0.983
3840, 28, 1, 1, 0.982
3840, 1, 28, 0, 0.916
3840, 1, 28, 1, 0.916
3840, 60, 1, 0, 0.969
3840, 60, 1, 1, 0.969
3840, 1, 60, 0, 0.891
3840, 1, 60, 1, 0.891
3840, 2076, 1, 0, 0.983
3840, 2076, 1, 1, 0.983
3840, 2049, 28, 0, 0.912
3840, 2049, 28, 1, 0.912
3904, 0, 0, 0, 1.002
3904, 0, 0, 1, 1.0
3904, 29, 0, 0, 0.961
3904, 29, 0, 1, 0.961
3904, 61, 0, 0, 0.997
3904, 61, 0, 1, 0.997
3904, 0, 29, 0, 0.915
3904, 0, 29, 1, 0.922
3904, 0, 61, 0, 0.933
3904, 0, 61, 1, 0.933
3904, 29, 29, 0, 1.103
3904, 29, 29, 1, 1.103
3904, 61, 61, 0, 0.995
3904, 61, 61, 1, 0.995
3904, 2048, 0, 0, 0.998
3904, 2048, 0, 1, 1.0
3904, 2077, 0, 0, 0.961
3904, 2077, 0, 1, 0.961
3904, 2048, 29, 0, 0.904
3904, 2048, 29, 1, 0.904
3904, 2077, 29, 0, 1.103
3904, 2077, 29, 1, 1.103
3904, 29, 1, 0, 1.0
3904, 29, 1, 1, 1.0
3904, 1, 29, 0, 0.922
3904, 1, 29, 1, 0.922
3904, 61, 1, 0, 0.98
3904, 61, 1, 1, 0.98
3904, 1, 61, 0, 0.904
3904, 1, 61, 1, 0.904
3904, 2077, 1, 0, 1.0
3904, 2077, 1, 1, 1.0
3904, 2049, 29, 0, 0.919
3904, 2049, 29, 1, 0.919
3968, 0, 0, 0, 1.003
3968, 0, 0, 1, 1.003
3968, 30, 0, 0, 0.969
3968, 30, 0, 1, 0.969
3968, 62, 0, 0, 1.006
3968, 62, 0, 1, 1.006
3968, 0, 30, 0, 0.931
3968, 0, 30, 1, 0.93
3968, 0, 62, 0, 0.929
3968, 0, 62, 1, 0.929
3968, 30, 30, 0, 1.103
3968, 30, 30, 1, 1.103
3968, 62, 62, 0, 0.99
3968, 62, 62, 1, 0.99
3968, 2048, 0, 0, 1.004
3968, 2048, 0, 1, 1.004
3968, 2078, 0, 0, 0.969
3968, 2078, 0, 1, 0.969
3968, 2048, 30, 0, 0.899
3968, 2048, 30, 1, 0.899
3968, 2078, 30, 0, 1.105
3968, 2078, 30, 1, 1.105
3968, 30, 1, 0, 0.993
3968, 30, 1, 1, 0.993
3968, 1, 30, 0, 0.908
3968, 1, 30, 1, 0.908
3968, 62, 1, 0, 0.978
3968, 62, 1, 1, 0.978
3968, 1, 62, 0, 0.895
3968, 1, 62, 1, 0.895
3968, 2078, 1, 0, 0.993
3968, 2078, 1, 1, 0.993
3968, 2049, 30, 0, 0.904
3968, 2049, 30, 1, 0.904
4032, 0, 0, 0, 0.995
4032, 0, 0, 1, 0.995
4032, 31, 0, 0, 0.967
4032, 31, 0, 1, 0.967
4032, 63, 0, 0, 1.002
4032, 63, 0, 1, 1.002
4032, 0, 31, 0, 0.927
4032, 0, 31, 1, 0.926
4032, 0, 63, 0, 0.927
4032, 0, 63, 1, 0.927
4032, 31, 31, 0, 1.09
4032, 31, 31, 1, 1.09
4032, 63, 63, 0, 0.987
4032, 63, 63, 1, 0.987
4032, 2048, 0, 0, 0.995
4032, 2048, 0, 1, 0.995
4032, 2079, 0, 0, 0.967
4032, 2079, 0, 1, 0.967
4032, 2048, 31, 0, 0.897
4032, 2048, 31, 1, 0.897
4032, 2079, 31, 0, 1.09
4032, 2079, 31, 1, 1.09
4032, 31, 1, 0, 0.989
4032, 31, 1, 1, 0.989
4032, 1, 31, 0, 0.911
4032, 1, 31, 1, 0.911
4032, 63, 1, 0, 0.971
4032, 63, 1, 1, 0.972
4032, 1, 63, 0, 0.892
4032, 1, 63, 1, 0.892
4032, 2079, 1, 0, 0.989
4032, 2079, 1, 1, 0.989
4032, 2049, 31, 0, 0.907
4032, 2049, 31, 1, 0.907
4096, 32, 0, 0, 1.014
4096, 32, 0, 1, 1.014
4096, 64, 0, 0, 1.014
4096, 64, 0, 1, 1.014
4096, 0, 32, 0, 1.012
4096, 0, 32, 1, 1.012
4096, 0, 64, 0, 1.012
4096, 0, 64, 1, 1.012
4096, 32, 32, 0, 1.014
4096, 32, 32, 1, 1.014
4096, 64, 64, 0, 1.014
4096, 64, 64, 1, 1.014
4096, 2080, 0, 0, 1.014
4096, 2080, 0, 1, 1.014
4096, 2048, 32, 0, 1.014
4096, 2048, 32, 1, 1.014
4096, 2080, 32, 0, 1.014
4096, 2080, 32, 1, 1.014
4096, 32, 1, 0, 0.975
4096, 32, 1, 1, 0.975
4096, 1, 32, 0, 0.769
4096, 1, 32, 1, 0.769
4096, 64, 1, 0, 0.858
4096, 64, 1, 1, 0.858
4096, 1, 64, 0, 0.769
4096, 1, 64, 1, 0.769
4096, 2080, 1, 0, 0.829
4096, 2080, 1, 1, 0.829
4096, 2049, 32, 0, 0.886
4096, 2049, 32, 1, 0.886
4160, 0, 0, 0, 1.003
4160, 0, 0, 1, 1.003
4160, 33, 0, 0, 1.004
4160, 33, 0, 1, 1.004
4160, 65, 0, 0, 0.999
4160, 65, 0, 1, 0.999
4160, 0, 33, 0, 0.931
4160, 0, 33, 1, 0.931
4160, 0, 65, 0, 0.765
4160, 0, 65, 1, 0.765
4160, 33, 33, 0, 0.998
4160, 33, 33, 1, 0.998
4160, 65, 65, 0, 0.942
4160, 65, 65, 1, 0.942
4160, 2048, 0, 0, 1.003
4160, 2048, 0, 1, 1.003
4160, 2081, 0, 0, 1.004
4160, 2081, 0, 1, 1.004
4160, 2048, 33, 0, 0.899
4160, 2048, 33, 1, 0.898
4160, 2081, 33, 0, 1.002
4160, 2081, 33, 1, 1.002
4160, 33, 1, 0, 1.114
4160, 33, 1, 1, 1.114
4160, 1, 33, 0, 1.01
4160, 1, 33, 1, 1.01
4160, 65, 1, 0, 1.077
4160, 65, 1, 1, 1.077
4160, 1, 65, 0, 0.935
4160, 1, 65, 1, 0.935
4160, 2081, 1, 0, 1.077
4160, 2081, 1, 1, 1.077
4160, 2049, 33, 0, 1.007
4160, 2049, 33, 1, 1.007
4224, 0, 0, 0, 1.014
4224, 0, 0, 1, 1.014
4224, 34, 0, 0, 1.0
4224, 34, 0, 1, 1.0
4224, 66, 0, 0, 1.001
4224, 66, 0, 1, 1.001
4224, 0, 34, 0, 0.928
4224, 0, 34, 1, 0.928
4224, 0, 66, 0, 0.762
4224, 0, 66, 1, 0.762
4224, 34, 34, 0, 0.998
4224, 34, 34, 1, 0.998
4224, 66, 66, 0, 0.959
4224, 66, 66, 1, 0.959
4224, 2048, 0, 0, 1.014
4224, 2048, 0, 1, 1.014
4224, 2082, 0, 0, 1.001
4224, 2082, 0, 1, 1.001
4224, 2048, 34, 0, 0.899
4224, 2048, 34, 1, 0.898
4224, 2082, 34, 0, 0.998
4224, 2082, 34, 1, 0.998
4224, 34, 1, 0, 1.024
4224, 34, 1, 1, 1.023
4224, 1, 34, 0, 0.917
4224, 1, 34, 1, 0.917
4224, 66, 1, 0, 1.012
4224, 66, 1, 1, 1.013
4224, 1, 66, 0, 0.917
4224, 1, 66, 1, 0.917
4224, 2082, 1, 0, 1.022
4224, 2082, 1, 1, 1.022
4224, 2049, 34, 0, 0.914
4224, 2049, 34, 1, 0.914
4288, 0, 0, 0, 0.999
4288, 0, 0, 1, 0.999
4288, 35, 0, 0, 0.995
4288, 35, 0, 1, 0.996
4288, 67, 0, 0, 0.998
4288, 67, 0, 1, 0.998
4288, 0, 35, 0, 0.919
4288, 0, 35, 1, 0.918
4288, 0, 67, 0, 0.767
4288, 0, 67, 1, 0.767
4288, 35, 35, 0, 1.005
4288, 35, 35, 1, 1.004
4288, 67, 67, 0, 0.995
4288, 67, 67, 1, 0.995
4288, 2048, 0, 0, 0.999
4288, 2048, 0, 1, 0.999
4288, 2083, 0, 0, 0.995
4288, 2083, 0, 1, 0.995
4288, 2048, 35, 0, 0.905
4288, 2048, 35, 1, 0.904
4288, 2083, 35, 0, 1.005
4288, 2083, 35, 1, 1.004
4288, 35, 1, 0, 1.033
4288, 35, 1, 1, 1.032
4288, 1, 35, 0, 0.928
4288, 1, 35, 1, 0.928
4288, 67, 1, 0, 1.019
4288, 67, 1, 1, 1.02
4288, 1, 67, 0, 0.925
4288, 1, 67, 1, 0.924
4288, 2083, 1, 0, 1.03
4288, 2083, 1, 1, 1.03
4288, 2049, 35, 0, 0.925
4288, 2049, 35, 1, 0.926
4352, 0, 0, 0, 1.005
4352, 0, 0, 1, 1.005
4352, 36, 0, 0, 1.007
4352, 36, 0, 1, 1.006
4352, 68, 0, 0, 1.007
4352, 68, 0, 1, 1.008
4352, 0, 36, 0, 0.929
4352, 0, 36, 1, 0.929
4352, 0, 68, 0, 0.766
4352, 0, 68, 1, 0.766
4352, 36, 36, 0, 0.998
4352, 36, 36, 1, 0.998
4352, 68, 68, 0, 0.964
4352, 68, 68, 1, 0.964
4352, 2048, 0, 0, 1.006
4352, 2048, 0, 1, 1.006
4352, 2084, 0, 0, 1.006
4352, 2084, 0, 1, 1.006
4352, 2048, 36, 0, 0.897
4352, 2048, 36, 1, 0.898
4352, 2084, 36, 0, 0.998
4352, 2084, 36, 1, 0.998
4352, 36, 1, 0, 1.031
4352, 36, 1, 1, 1.031
4352, 1, 36, 0, 0.924
4352, 1, 36, 1, 0.924
4352, 68, 1, 0, 0.999
4352, 68, 1, 1, 0.999
4352, 1, 68, 0, 0.922
4352, 1, 68, 1, 0.922
4352, 2084, 1, 0, 1.03
4352, 2084, 1, 1, 1.03
4352, 2049, 36, 0, 0.922
4352, 2049, 36, 1, 0.922
4416, 0, 0, 0, 0.997
4416, 0, 0, 1, 0.997
4416, 37, 0, 0, 1.002
4416, 37, 0, 1, 1.002
4416, 69, 0, 0, 1.004
4416, 69, 0, 1, 1.004
4416, 0, 37, 0, 0.928
4416, 0, 37, 1, 0.927
4416, 0, 69, 0, 0.762
4416, 0, 69, 1, 0.762
4416, 37, 37, 0, 0.994
4416, 37, 37, 1, 0.994
4416, 69, 69, 0, 0.959
4416, 69, 69, 1, 0.959
4416, 2048, 0, 0, 0.997
4416, 2048, 0, 1, 0.997
4416, 2085, 0, 0, 1.001
4416, 2085, 0, 1, 1.001
4416, 2048, 37, 0, 0.899
4416, 2048, 37, 1, 0.899
4416, 2085, 37, 0, 0.994
4416, 2085, 37, 1, 0.994
4416, 37, 1, 0, 1.024
4416, 37, 1, 1, 1.023
4416, 1, 37, 0, 0.923
4416, 1, 37, 1, 0.922
4416, 69, 1, 0, 1.009
4416, 69, 1, 1, 1.01
4416, 1, 69, 0, 0.917
4416, 1, 69, 1, 0.917
4416, 2085, 1, 0, 1.024
4416, 2085, 1, 1, 1.024
4416, 2049, 37, 0, 0.919
4416, 2049, 37, 1, 0.919
4480, 0, 0, 0, 1.0
4480, 0, 0, 1, 0.999
4480, 38, 0, 0, 0.996
4480, 38, 0, 1, 0.996
4480, 70, 0, 0, 1.0
4480, 70, 0, 1, 1.0
4480, 0, 38, 0, 0.919
4480, 0, 38, 1, 0.921
4480, 0, 70, 0, 0.767
4480, 0, 70, 1, 0.767
4480, 38, 38, 0, 1.002
4480, 38, 38, 1, 1.002
4480, 70, 70, 0, 0.963
4480, 70, 70, 1, 0.963
4480, 2048, 0, 0, 0.998
4480, 2048, 0, 1, 0.999
4480, 2086, 0, 0, 0.996
4480, 2086, 0, 1, 0.995
4480, 2048, 38, 0, 0.907
4480, 2048, 38, 1, 0.907
4480, 2086, 38, 0, 1.002
4480, 2086, 38, 1, 1.002
4480, 38, 1, 0, 1.032
4480, 38, 1, 1, 1.031
4480, 1, 38, 0, 0.919
4480, 1, 38, 1, 0.92
4480, 70, 1, 0, 1.018
4480, 70, 1, 1, 1.017
4480, 1, 70, 0, 0.916
4480, 1, 70, 1, 0.915
4480, 2086, 1, 0, 1.031
4480, 2086, 1, 1, 1.03
4480, 2049, 38, 0, 0.917
4480, 2049, 38, 1, 0.918
4544, 0, 0, 0, 1.002
4544, 0, 0, 1, 1.002
4544, 39, 0, 0, 1.007
4544, 39, 0, 1, 1.008
4544, 71, 0, 0, 1.002
4544, 71, 0, 1, 1.002
4544, 0, 39, 0, 0.93
4544, 0, 39, 1, 0.931
4544, 0, 71, 0, 0.766
4544, 0, 71, 1, 0.766
4544, 39, 39, 0, 1.001
4544, 39, 39, 1, 1.001
4544, 71, 71, 0, 0.966
4544, 71, 71, 1, 0.966
4544, 2048, 0, 0, 1.002
4544, 2048, 0, 1, 1.002
4544, 2087, 0, 0, 1.008
4544, 2087, 0, 1, 1.007
4544, 2048, 39, 0, 0.901
4544, 2048, 39, 1, 0.901
4544, 2087, 39, 0, 1.001
4544, 2087, 39, 1, 1.001
4544, 39, 1, 0, 1.025
4544, 39, 1, 1, 1.025
4544, 1, 39, 0, 0.919
4544, 1, 39, 1, 0.919
4544, 71, 1, 0, 0.991
4544, 71, 1, 1, 0.991
4544, 1, 71, 0, 0.921
4544, 1, 71, 1, 0.922
4544, 2087, 1, 0, 1.025
4544, 2087, 1, 1, 1.025
4544, 2049, 39, 0, 0.917
4544, 2049, 39, 1, 0.917
4608, 0, 0, 0, 0.997
4608, 0, 0, 1, 0.997
4608, 40, 0, 0, 1.013
4608, 40, 0, 1, 1.013
4608, 72, 0, 0, 1.013
4608, 72, 0, 1, 1.013
4608, 0, 40, 0, 0.925
4608, 0, 40, 1, 0.926
4608, 0, 72, 0, 0.765
4608, 0, 72, 1, 0.765
4608, 40, 40, 0, 1.084
4608, 40, 40, 1, 1.084
4608, 72, 72, 0, 0.966
4608, 72, 72, 1, 0.966
4608, 2048, 0, 0, 0.999
4608, 2048, 0, 1, 0.999
4608, 2088, 0, 0, 1.012
4608, 2088, 0, 1, 1.012
4608, 2048, 40, 0, 0.898
4608, 2048, 40, 1, 0.898
4608, 2088, 40, 0, 1.087
4608, 2088, 40, 1, 1.087
4608, 40, 1, 0, 1.006
4608, 40, 1, 1, 1.006
4608, 1, 40, 0, 0.926
4608, 1, 40, 1, 0.925
4608, 72, 1, 0, 1.012
4608, 72, 1, 1, 1.011
4608, 1, 72, 0, 0.92
4608, 1, 72, 1, 0.92
4608, 2088, 1, 0, 1.006
4608, 2088, 1, 1, 1.006
4608, 2049, 40, 0, 0.923
4608, 2049, 40, 1, 0.923
4672, 0, 0, 0, 1.014
4672, 0, 0, 1, 1.014
4672, 41, 0, 0, 1.003
4672, 41, 0, 1, 1.003
4672, 73, 0, 0, 0.983
4672, 73, 0, 1, 0.982
4672, 0, 41, 0, 0.916
4672, 0, 41, 1, 0.918
4672, 0, 73, 0, 0.772
4672, 0, 73, 1, 0.772
4672, 41, 41, 0, 1.012
4672, 41, 41, 1, 1.012
4672, 73, 73, 0, 0.973
4672, 73, 73, 1, 0.973
4672, 2048, 0, 0, 1.014
4672, 2048, 0, 1, 1.014
4672, 2089, 0, 0, 1.002
4672, 2089, 0, 1, 1.002
4672, 2048, 41, 0, 0.907
4672, 2048, 41, 1, 0.908
4672, 2089, 41, 0, 1.012
4672, 2089, 41, 1, 1.012
4672, 41, 1, 0, 1.027
4672, 41, 1, 1, 1.027
4672, 1, 41, 0, 0.928
4672, 1, 41, 1, 0.927
4672, 73, 1, 0, 1.032
4672, 73, 1, 1, 1.03
4672, 1, 73, 0, 0.927
4672, 1, 73, 1, 0.927
4672, 2089, 1, 0, 1.026
4672, 2089, 1, 1, 1.027
4672, 2049, 41, 0, 0.925
4672, 2049, 41, 1, 0.925
4736, 0, 0, 0, 1.005
4736, 0, 0, 1, 1.005
4736, 42, 0, 0, 1.012
4736, 42, 0, 1, 1.012
4736, 74, 0, 0, 0.976
4736, 74, 0, 1, 0.975
4736, 0, 42, 0, 0.93
4736, 0, 42, 1, 0.93
4736, 0, 74, 0, 0.77
4736, 0, 74, 1, 0.77
4736, 42, 42, 0, 1.007
4736, 42, 42, 1, 1.007
4736, 74, 74, 0, 0.965
4736, 74, 74, 1, 0.965
4736, 2048, 0, 0, 1.006
4736, 2048, 0, 1, 1.006
4736, 2090, 0, 0, 1.013
4736, 2090, 0, 1, 1.013
4736, 2048, 42, 0, 0.902
4736, 2048, 42, 1, 0.902
4736, 2090, 42, 0, 1.007
4736, 2090, 42, 1, 1.007
4736, 42, 1, 0, 1.032
4736, 42, 1, 1, 1.032
4736, 1, 42, 0, 0.925
4736, 1, 42, 1, 0.925
4736, 74, 1, 0, 1.018
4736, 74, 1, 1, 1.018
4736, 1, 74, 0, 0.912
4736, 1, 74, 1, 0.912
4736, 2090, 1, 0, 1.032
4736, 2090, 1, 1, 1.032
4736, 2049, 42, 0, 0.923
4736, 2049, 42, 1, 0.923
4800, 0, 0, 0, 1.012
4800, 0, 0, 1, 1.012
4800, 43, 0, 0, 1.008
4800, 43, 0, 1, 1.008
4800, 75, 0, 0, 0.99
4800, 75, 0, 1, 0.99
4800, 0, 43, 0, 0.928
4800, 0, 43, 1, 0.928
4800, 0, 75, 0, 0.767
4800, 0, 75, 1, 0.768
4800, 43, 43, 0, 1.004
4800, 43, 43, 1, 1.004
4800, 75, 75, 0, 0.965
4800, 75, 75, 1, 0.965
4800, 2048, 0, 0, 1.012
4800, 2048, 0, 1, 1.012
4800, 2091, 0, 0, 1.009
4800, 2091, 0, 1, 1.008
4800, 2048, 43, 0, 0.902
4800, 2048, 43, 1, 0.902
4800, 2091, 43, 0, 1.004
4800, 2091, 43, 1, 1.004
4800, 43, 1, 0, 1.026
4800, 43, 1, 1, 1.025
4800, 1, 43, 0, 0.91
4800, 1, 43, 1, 0.91
4800, 75, 1, 0, 0.992
4800, 75, 1, 1, 0.992
4800, 1, 75, 0, 0.921
4800, 1, 75, 1, 0.92
4800, 2091, 1, 0, 1.025
4800, 2091, 1, 1, 1.025
4800, 2049, 43, 0, 0.907
4800, 2049, 43, 1, 0.907
4864, 0, 0, 0, 0.998
4864, 0, 0, 1, 0.998
4864, 44, 0, 0, 1.003
4864, 44, 0, 1, 1.004
4864, 76, 0, 0, 0.987
4864, 76, 0, 1, 0.987
4864, 0, 44, 0, 0.92
4864, 0, 44, 1, 0.921
4864, 0, 76, 0, 0.933
4864, 0, 76, 1, 0.932
4864, 44, 44, 0, 1.006
4864, 44, 44, 1, 1.004
4864, 76, 76, 0, 0.976
4864, 76, 76, 1, 0.975
4864, 2048, 0, 0, 0.999
4864, 2048, 0, 1, 0.999
4864, 2092, 0, 0, 1.004
4864, 2092, 0, 1, 1.005
4864, 2048, 44, 0, 0.907
4864, 2048, 44, 1, 0.907
4864, 2092, 44, 0, 1.006
4864, 2092, 44, 1, 1.005
4864, 44, 1, 0, 1.034
4864, 44, 1, 1, 1.032
4864, 1, 44, 0, 0.908
4864, 1, 44, 1, 0.929
4864, 76, 1, 0, 1.006
4864, 76, 1, 1, 1.005
4864, 1, 76, 0, 0.798
4864, 1, 76, 1, 0.798
4864, 2092, 1, 0, 1.033
4864, 2092, 1, 1, 1.033
4864, 2049, 44, 0, 0.904
4864, 2049, 44, 1, 0.925
4928, 0, 0, 0, 1.005
4928, 0, 0, 1, 1.005
4928, 45, 0, 0, 0.993
4928, 45, 0, 1, 1.012
4928, 77, 0, 0, 0.956
4928, 77, 0, 1, 0.976
4928, 0, 45, 0, 0.933
4928, 0, 45, 1, 0.932
4928, 0, 77, 0, 0.771
4928, 0, 77, 1, 0.771
4928, 45, 45, 0, 1.015
4928, 45, 45, 1, 1.015
4928, 77, 77, 0, 0.972
4928, 77, 77, 1, 0.972
4928, 2048, 0, 0, 1.005
4928, 2048, 0, 1, 1.005
4928, 2093, 0, 0, 0.992
4928, 2093, 0, 1, 1.012
4928, 2048, 45, 0, 0.932
4928, 2048, 45, 1, 0.931
4928, 2093, 45, 0, 1.015
4928, 2093, 45, 1, 1.015
4928, 45, 1, 0, 1.009
4928, 45, 1, 1, 1.032
4928, 1, 45, 0, 0.806
4928, 1, 45, 1, 0.805
4928, 77, 1, 0, 0.981
4928, 77, 1, 1, 1.005
4928, 1, 77, 0, 0.917
4928, 1, 77, 1, 0.917
4928, 2093, 1, 0, 1.008
4928, 2093, 1, 1, 1.032
4928, 2049, 45, 0, 0.794
4928, 2049, 45, 1, 0.794
4992, 0, 0, 0, 0.999
4992, 0, 0, 1, 0.999
4992, 46, 0, 0, 0.985
4992, 46, 0, 1, 1.008
4992, 78, 0, 0, 0.963
4992, 78, 0, 1, 0.984
4992, 0, 46, 0, 0.908
4992, 0, 46, 1, 0.908
4992, 0, 78, 0, 0.752
4992, 0, 78, 1, 0.751
4992, 46, 46, 0, 0.997
4992, 46, 46, 1, 0.997
4992, 78, 78, 0, 0.969
4992, 78, 78, 1, 0.968
4992, 2048, 0, 0, 1.0
4992, 2048, 0, 1, 1.0
4992, 2094, 0, 0, 0.987
4992, 2094, 0, 1, 1.008
4992, 2048, 46, 0, 0.883
4992, 2048, 46, 1, 0.883
4992, 2094, 46, 0, 0.997
4992, 2094, 46, 1, 0.997
4992, 46, 1, 0, 0.998
4992, 46, 1, 1, 1.02
4992, 1, 46, 0, 0.917
4992, 1, 46, 1, 0.917
4992, 78, 1, 0, 0.972
4992, 78, 1, 1, 0.993
4992, 1, 78, 0, 0.919
4992, 1, 78, 1, 0.92
4992, 2094, 1, 0, 0.997
4992, 2094, 1, 1, 1.019
4992, 2049, 46, 0, 0.914
4992, 2049, 46, 1, 0.914
5056, 0, 0, 0, 1.002
5056, 0, 0, 1, 1.0
5056, 47, 0, 0, 1.005
5056, 47, 0, 1, 1.005
5056, 79, 0, 0, 0.989
5056, 79, 0, 1, 0.989
5056, 0, 47, 0, 0.918
5056, 0, 47, 1, 0.919
5056, 0, 79, 0, 0.772
5056, 0, 79, 1, 0.771
5056, 47, 47, 0, 1.006
5056, 47, 47, 1, 1.006
5056, 79, 79, 0, 0.972
5056, 79, 79, 1, 0.972
5056, 2048, 0, 0, 1.001
5056, 2048, 0, 1, 1.0
5056, 2095, 0, 0, 1.004
5056, 2095, 0, 1, 1.004
5056, 2048, 47, 0, 0.908
5056, 2048, 47, 1, 0.909
5056, 2095, 47, 0, 1.006
5056, 2095, 47, 1, 1.006
5056, 47, 1, 0, 1.033
5056, 47, 1, 1, 1.033
5056, 1, 47, 0, 0.919
5056, 1, 47, 1, 0.919
5056, 79, 1, 0, 1.003
5056, 79, 1, 1, 1.005
5056, 1, 79, 0, 0.921
5056, 1, 79, 1, 0.921
5056, 2095, 1, 0, 1.032
5056, 2095, 1, 1, 1.034
5056, 2049, 47, 0, 0.918
5056, 2049, 47, 1, 0.917
5120, 0, 0, 0, 1.003
5120, 0, 0, 1, 1.003
5120, 48, 0, 0, 1.068
5120, 48, 0, 1, 1.068
5120, 80, 0, 0, 1.068
5120, 80, 0, 1, 1.068
5120, 0, 48, 0, 1.065
5120, 0, 48, 1, 1.065
5120, 0, 80, 0, 1.064
5120, 0, 80, 1, 1.065
5120, 48, 48, 0, 1.004
5120, 48, 48, 1, 1.004
5120, 80, 80, 0, 1.005
5120, 80, 80, 1, 1.005
5120, 2048, 0, 0, 1.005
5120, 2048, 0, 1, 1.005
5120, 2096, 0, 0, 1.068
5120, 2096, 0, 1, 1.068
5120, 2048, 48, 0, 1.065
5120, 2048, 48, 1, 1.065
5120, 2096, 48, 0, 1.005
5120, 2096, 48, 1, 1.005
5120, 48, 1, 0, 1.033
5120, 48, 1, 1, 1.031
5120, 1, 48, 0, 0.898
5120, 1, 48, 1, 0.898
5120, 80, 1, 0, 0.844
5120, 80, 1, 1, 0.844
5120, 1, 80, 0, 0.898
5120, 1, 80, 1, 0.898
5120, 2096, 1, 0, 0.856
5120, 2096, 1, 1, 0.855
5120, 2049, 48, 0, 0.898
5120, 2049, 48, 1, 0.898
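
For reference when reading these tables: each row times one copy
configuration (length, two byte offsets, and a 0/1 flag) and reports
New Time / Old Time, so values below 1.0 favor the new implementation.
Below is a minimal, self-contained sketch of how a single row could be
measured; the buffer layout, iteration count, and clock source are
illustrative assumptions, not the actual glibc benchtests harness.

/* Minimal sketch (not the glibc benchtests driver): time ITERS memcpy
   calls for one (length, align1, align2) configuration.  Which offset
   applies to source vs. destination, the iteration count, and the use
   of CLOCK_MONOTONIC are all illustrative assumptions.  */
#include <stdio.h>
#include <string.h>
#include <time.h>

#define ITERS 100000
#define BUF_BYTES (1 << 23)

static char src_buf[BUF_BYTES];
static char dst_buf[BUF_BYTES];

static double
time_config (size_t len, size_t align1, size_t align2)
{
  struct timespec t0, t1;
  clock_gettime (CLOCK_MONOTONIC, &t0);
  for (int i = 0; i < ITERS; i++)
    memcpy (dst_buf + align2, src_buf + align1, len);
  clock_gettime (CLOCK_MONOTONIC, &t1);
  return (t1.tv_sec - t0.tv_sec) * 1e9 + (t1.tv_nsec - t0.tv_nsec);
}

int
main (void)
{
  /* Corresponds to the "2048, 32, 0" rows above; run once against a
     build with the old memcpy and once with the new one, then report
     new_ns / old_ns.  */
  printf ("2048, 32, 0: %.0f ns\n", time_config (2048, 32, 0));
  return 0;
}
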
bench-memcpy-random:
length, New Time / Old Time
32768, 0.866
65536, 0.891
131072, 0.896
262144, 0.901
524288, 0.904
1048576, 0.913
bench-memcpy-large:
length, align0, align1, dst > src, New Time / Old Time
65543, 0, 0, 0, 0.981
65543, 0, 0, 1, 0.981
65551, 0, 3, 0, 1.012
65551, 0, 3, 1, 1.013
65567, 3, 0, 0, 1.019
65567, 3, 0, 1, 1.02
65599, 3, 5, 0, 1.058
65599, 3, 5, 1, 1.061
65536, 0, 127, 0, 1.046
65536, 0, 127, 1, 1.046
65536, 0, 255, 0, 1.071
65536, 0, 255, 1, 1.071
65536, 0, 256, 0, 0.983
65536, 0, 256, 1, 0.984
65536, 0, 4064, 0, 1.017
65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.981
131079, 0, 0, 1, 0.981
131087, 0, 3, 0, 1.017
131087, 0, 3, 1, 1.017
131103, 3, 0, 0, 1.022
131103, 3, 0, 1, 1.022
131135, 3, 5, 0, 1.064
131135, 3, 5, 1, 1.065
131072, 0, 127, 0, 1.05
131072, 0, 127, 1, 1.05
131072, 0, 255, 0, 1.074
131072, 0, 255, 1, 1.074
131072, 0, 256, 0, 0.984
131072, 0, 256, 1, 0.984
131072, 0, 4064, 0, 1.018
131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.985
262151, 0, 0, 1, 0.985
262159, 0, 3, 0, 1.026
262159, 0, 3, 1, 1.026
262175, 3, 0, 0, 1.03
262175, 3, 0, 1, 1.03
262207, 3, 5, 0, 1.07
262207, 3, 5, 1, 1.07
262144, 0, 127, 0, 1.057
262144, 0, 127, 1, 1.057
262144, 0, 255, 0, 1.079
262144, 0, 255, 1, 1.078
262144, 0, 256, 0, 0.988
262144, 0, 256, 1, 0.988
262144, 0, 4064, 0, 1.02
262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692
524295, 0, 0, 1, 0.692
524303, 0, 3, 0, 0.736
524303, 0, 3, 1, 0.737
524319, 3, 0, 0, 0.758
524319, 3, 0, 1, 0.759
524351, 3, 5, 0, 0.759
524351, 3, 5, 1, 0.759
524288, 0, 127, 0, 1.057
524288, 0, 127, 1, 1.058
524288, 0, 255, 0, 1.079
524288, 0, 255, 1, 1.079
524288, 0, 256, 0, 0.988
524288, 0, 256, 1, 0.988
524288, 0, 4064, 0, 1.02
524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948
1048583, 0, 0, 1, 0.948
1048591, 0, 3, 0, 0.735
1048591, 0, 3, 1, 0.735
1048607, 3, 0, 0, 0.757
1048607, 3, 0, 1, 0.758
1048639, 3, 5, 0, 0.758
1048639, 3, 5, 1, 0.758
1048576, 0, 127, 0, 0.761
1048576, 0, 127, 1, 0.762
1048576, 0, 255, 0, 0.751
1048576, 0, 255, 1, 0.751
1048576, 0, 256, 0, 0.93
1048576, 0, 256, 1, 0.93
1048576, 0, 4064, 0, 0.93
1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928
2097159, 0, 0, 1, 0.931
2097167, 0, 3, 0, 0.735
2097167, 0, 3, 1, 0.734
2097183, 3, 0, 0, 0.759
2097183, 3, 0, 1, 0.759
2097215, 3, 5, 0, 0.758
2097215, 3, 5, 1, 0.757
2097152, 0, 127, 0, 0.77
2097152, 0, 127, 1, 0.77
2097152, 0, 255, 0, 0.745
2097152, 0, 255, 1, 0.745
2097152, 0, 256, 0, 0.924
2097152, 0, 256, 1, 0.925
2097152, 0, 4064, 0, 0.926
2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.894
4194311, 0, 0, 1, 0.896
4194319, 0, 3, 0, 0.752
4194319, 0, 3, 1, 0.751
4194335, 3, 0, 0, 0.82
4194335, 3, 0, 1, 0.821
4194367, 3, 5, 0, 0.788
4194367, 3, 5, 1, 0.789
4194304, 0, 127, 0, 0.801
4194304, 0, 127, 1, 0.801
4194304, 0, 255, 0, 0.802
4194304, 0, 255, 1, 0.804
4194304, 0, 256, 0, 0.873
4194304, 0, 256, 1, 0.868
4194304, 0, 4064, 0, 0.955
4194304, 0, 4064, 1, 0.954
8388615, 0, 0, 0, 0.885
8388615, 0, 0, 1, 0.886
8388623, 0, 3, 0, 0.769
8388623, 0, 3, 1, 0.769
8388639, 3, 0, 0, 0.87
8388639, 3, 0, 1, 0.87
8388671, 3, 5, 0, 0.811
8388671, 3, 5, 1, 0.814
8388608, 0, 127, 0, 0.83
8388608, 0, 127, 1, 0.83
8388608, 0, 255, 0, 0.857
8388608, 0, 255, 1, 0.857
8388608, 0, 256, 0, 0.851
8388608, 0, 256, 1, 0.848
8388608, 0, 4064, 0, 0.981
8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.885
16777223, 0, 0, 1, 0.886
16777231, 0, 3, 0, 0.769
16777231, 0, 3, 1, 0.768
16777247, 3, 0, 0, 0.87
16777247, 3, 0, 1, 0.87
16777279, 3, 5, 0, 0.811
16777279, 3, 5, 1, 0.814
16777216, 0, 127, 0, 0.831
16777216, 0, 127, 1, 0.83
16777216, 0, 255, 0, 0.857
16777216, 0, 255, 1, 0.857
16777216, 0, 256, 0, 0.852
16777216, 0, 256, 1, 0.848
16777216, 0, 4064, 0, 0.98
16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.885
33554439, 0, 0, 1, 0.886
33554447, 0, 3, 0, 0.768
33554447, 0, 3, 1, 0.768
33554463, 3, 0, 0, 0.871
33554463, 3, 0, 1, 0.87
33554495, 3, 5, 0, 0.811
33554495, 3, 5, 1, 0.814
33554432, 0, 127, 0, 0.831
33554432, 0, 127, 1, 0.831
33554432, 0, 255, 0, 0.858
33554432, 0, 255, 1, 0.857
33554432, 0, 256, 0, 0.852
33554432, 0, 256, 1, 0.848
33554432, 0, 4064, 0, 0.98
33554432, 0, 4064, 1, 0.981
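
To boil one of these tables down to a single figure, the geometric mean
of the ratio column is the appropriate summary (the values are ratios,
so the arithmetic mean is biased).  A small sketch that reads rows of
the plain CSV shape shown above on stdin and averages the last
comma-separated field; it skips header lines, and is an illustration
rather than anything shipped with the benchtests:

/* Geometric mean of the final (New Time / Old Time) column of CSV rows
   fed on stdin.  Header lines without a trailing number parse to 0.0
   and are skipped.  Link with -lm.  */
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
  char line[256];
  double log_sum = 0.0;
  long n = 0;

  while (fgets (line, sizeof line, stdin) != NULL)
    {
      char *last = strrchr (line, ',');
      double ratio = strtod (last != NULL ? last + 1 : line, NULL);
      if (ratio > 0.0)
        {
          log_sum += log (ratio);
          n++;
        }
    }
  if (n > 0)
    printf ("geometric mean over %ld rows: %.3f\n", n, exp (log_sum / n));
  return 0;
}
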
sysdeps/x86_64/multiarch/Makefile | 1 -
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 ----------------------
sysdeps/x86_64/multiarch/memmove-ssse3.S | 386 ++-
3 files changed, 382 insertions(+), 3156 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
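
One detail worth noting in the removed file below: __memcpy_ssse3
selects a per-source-alignment copy loop (the L(shl_N) families, one
per byte shift fed to palignr) through tables of 32-bit offsets stored
relative to the table base; see the BRANCH_TO_JMPTBL_ENTRY macro and
its movslq/lea/indirect-jmp sequence.  Relative offsets keep the tables
position independent.  A rough C analogue of that dispatch scheme,
using GNU C's computed-goto label differences (illustrative only, not
glibc code):

/* Illustrative GNU C analogue of BRANCH_TO_JMPTBL_ENTRY: the table
   holds 32-bit offsets of each target relative to a base label, much
   as L(shl_table) stores offsets relative to the table address.
   Requires GCC/Clang; computed goto is a GNU extension.  */
#include <stdio.h>

static const char *
dispatch (int shift)
{
  static const int table[] = {
    &&shl_0 - &&shl_0,
    &&shl_1 - &&shl_0,
    &&shl_2 - &&shl_0,
  };
  goto *(&&shl_0 + table[shift]);

 shl_0:
  return "aligned 16-byte loop";
 shl_1:
  return "palignr $1 loop";
 shl_2:
  return "palignr $2 loop";
}

int
main (void)
{
  printf ("%s\n", dispatch (2));
  return 0;
}
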
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
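
(For reference, the BRANCH_TO_JMPTBL_ENTRY idiom used throughout the
deleted file works as follows: L(table_less_80bytes) above holds one
32-bit entry per possible remaining length 0..79, each storing the
offset of the matching L(write_Nbytes) stub relative to the table
base; the dispatch adds that offset to the table address and jumps.
Every stub addresses both buffers relative to their ends, so a single
indirect jump finishes the copy.  Below is a minimal C sketch of the
same end-relative tail dispatch; the names mirror the assembly labels
but are otherwise hypothetical, and a function-pointer table stands in
for the 32-bit offset table:

#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Each stub copies exactly N trailing bytes, addressing everything
   relative to the ends of the buffers, just as the assembly stubs
   use -N(%rsi)/-N(%rdi).  */
#define DEFINE_TAIL_STUB(N)					\
  static void							\
  write_ ## N ## bytes (char *dst_end, const char *src_end)	\
  {								\
    memcpy (dst_end - (N), src_end - (N), (N));			\
  }

static void
write_0bytes (char *dst_end, const char *src_end)
{
  (void) dst_end;
  (void) src_end;
}

DEFINE_TAIL_STUB (1)
DEFINE_TAIL_STUB (2)
DEFINE_TAIL_STUB (3)
DEFINE_TAIL_STUB (4)

/* Analog of L(table_less_80bytes), truncated to lengths 0..4; the
   real table has 80 entries.  */
static void (*const tail_table[]) (char *, const char *) = {
  write_0bytes, write_1bytes, write_2bytes, write_3bytes, write_4bytes,
};

int
main (void)
{
  const char src[] = "abcd";
  char dst[5] = "....";
  size_t rem = 3;	/* Bytes left after the bulk copy loop.  */

  /* BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4) reduces
     to this indexed indirect call/jump.  */
  tail_table[rem] (dst + 4, src + 4);
  puts (dst);		/* Prints ".bcd".  */
  return 0;
}

Storing table-relative offsets rather than absolute pointers keeps the
real table at 4 bytes per entry and free of dynamic relocations.  The
replacement implementation below drops the table entirely and instead
jumps to one of 16 equal-sized loops selected by misalignment * 64.)
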
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..84e4e0f6cb 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,382 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE __memmove_ssse3
+# define MEMMOVE_CHK __memmove_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
+#endif
+
+ .section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
+ jmp L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+ movq %rdi, %rax
+L(start):
+ cmpq $16, %rdx
+ jb L(copy_0_15)
+
+ /* These loads are always useful. */
+ movups 0(%rsi), %xmm0
+ movups -16(%rsi, %rdx), %xmm7
+ cmpq $32, %rdx
+ ja L(more_2x_vec)
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm7, -16(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_4x_vec):
+ movups 16(%rsi), %xmm1
+ movups -32(%rsi, %rdx), %xmm2
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm1, 16(%rdi)
+ movups %xmm2, -32(%rdi, %rdx)
+ movups %xmm7, -16(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_0_15):
+ cmpl $8, %edx
+ ja L(copy_9_15)
+
+ cmpl $4, %edx
+ jb L(copy_0_3)
+
+ movl 0(%rsi), %ecx
+ movl -4(%rsi, %rdx), %esi
+ movl %ecx, 0(%rdi)
+ movl %esi, -4(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 8
+L(copy_9_15):
+ movq 0(%rsi), %rcx
+ movq -8(%rsi, %rdx), %rsi
+ movq %rcx, 0(%rdi)
+ movq %rsi, -8(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 4
+L(copy_0_3):
+ cmpl $1, %edx
+ jl L(copy_0_0)
+ movzbl (%rsi), %ecx
+ je L(copy_0_1)
+
+ movzwl -2(%rsi, %rdx), %esi
+ movw %si, -2(%rdi, %rdx)
+L(copy_0_1):
+ movb %cl, (%rdi)
+L(copy_0_0):
+L(nop):
+ ret
+
+ .p2align 4
+L(more_2x_vec):
+ cmpq $64, %rdx
+ jbe L(copy_4x_vec)
+
+	/* We use rcx later to compute the `palignr` shift amount.  */
+ movq %rdi, %rcx
+
+	/* Copy backwards when dst > src and the ranges overlap, as
+	   required for memmove safety.  */
+ subq %rsi, %rcx
+ cmpq %rdx, %rcx
+ jb L(copy_backward)
+
+ /* Load tail. */
+
+ /* -16(%rsi, %rdx) already loaded into xmm7. */
+ movups -32(%rsi, %rdx), %xmm8
+ movups -48(%rsi, %rdx), %xmm9
+
+ /* Get misalignment. */
+ andl $0xf, %ecx
+
+ movq %rsi, %r9
+ addq %rcx, %rsi
+ andq $-16, %rsi
+ /* Get first vec for `palignr`. */
+ movaps (%rsi), %xmm1
+
+	/* We have already loaded (%rsi), so it is safe to do this
+	   store before the loop.  */
+ movups %xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+ cmp __x86_shared_cache_size_half(%rip), %rdx
+#endif
+ ja L(large_memcpy)
+
+ leaq -64(%rdi, %rdx), %r8
+ andq $-16, %rdi
+ movl $48, %edx
+
+ leaq L(loop_fwd_start)(%rip), %r9
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(copy_backward):
+ testq %rcx, %rcx
+ jz L(nop)
+
+ /* Preload tail. */
+
+ /* (%rsi) already loaded into xmm0. */
+ movups 16(%rsi), %xmm4
+ movups 32(%rsi), %xmm5
+
+ movq %rdi, %r8
+ subq %rdi, %rsi
+ leaq -49(%rdi, %rdx), %rdi
+ andq $-16, %rdi
+ addq %rdi, %rsi
+ andq $-16, %rsi
+
+ movaps 48(%rsi), %xmm6
+
+
+ leaq L(loop_bkwd_start)(%rip), %r9
+ andl $0xf, %ecx
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(large_memcpy):
+ movups -64(%r9, %rdx), %xmm10
+ movups -80(%r9, %rdx), %xmm11
+
+ sall $5, %ecx
+ leal (%rcx, %rcx, 2), %r8d
+ leaq -96(%rdi, %rdx), %rcx
+ andq $-16, %rdi
+ leaq L(large_loop_fwd_start)(%rip), %rdx
+ addq %r8, %rdx
+ jmp * %rdx
+
+
+ /* Instead of a typical jump table, all 16 loops are exactly
+ 64 bytes in size, so we can just jump to the first loop plus
+ misalignment * 64. Before modifying any loop ensure all their
+ sizes match!
+ */
+ .p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps %xmm1, 16(%rdi)
+ movaps %xmm2, 32(%rdi)
+ movaps %xmm3, 48(%rdi)
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+ cmpq %rdi, %r8
+ ja L(loop_fwd_0x0)
+L(end_loop_fwd):
+ movups %xmm9, 16(%r8)
+ movups %xmm8, 32(%r8)
+ movups %xmm7, 48(%r8)
+ ret
+
+ /* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+ 60 bytes otherwise. */
+#define ALIGNED_LOOP_FWD(align_by); \
+ .p2align 6; \
+L(loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm4, %xmm1; \
+ movaps %xmm0, 16(%rdi); \
+ movaps %xmm2, 32(%rdi); \
+ movaps %xmm3, 48(%rdi); \
+ addq %rdx, %rdi; \
+ addq %rdx, %rsi; \
+ cmpq %rdi, %r8; \
+ ja L(loop_fwd_ ## align_by); \
+ jmp L(end_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_FWD (0xf)
+ ALIGNED_LOOP_FWD (0xe)
+ ALIGNED_LOOP_FWD (0xd)
+ ALIGNED_LOOP_FWD (0xc)
+ ALIGNED_LOOP_FWD (0xb)
+ ALIGNED_LOOP_FWD (0xa)
+ ALIGNED_LOOP_FWD (0x9)
+ ALIGNED_LOOP_FWD (0x8)
+ ALIGNED_LOOP_FWD (0x7)
+ ALIGNED_LOOP_FWD (0x6)
+ ALIGNED_LOOP_FWD (0x5)
+ ALIGNED_LOOP_FWD (0x4)
+ ALIGNED_LOOP_FWD (0x3)
+ ALIGNED_LOOP_FWD (0x2)
+ ALIGNED_LOOP_FWD (0x1)
+
+ .p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps 64(%rsi), %xmm4
+ movaps 80(%rsi), %xmm5
+ movntps %xmm1, 16(%rdi)
+ movntps %xmm2, 32(%rdi)
+ movntps %xmm3, 48(%rdi)
+ movntps %xmm4, 64(%rdi)
+ movntps %xmm5, 80(%rdi)
+ addq $80, %rdi
+ addq $80, %rsi
+ cmpq %rdi, %rcx
+ ja L(large_loop_fwd_0x0)
+
+ /* Ensure no icache line split on tail. */
+ .p2align 4
+L(end_large_loop_fwd):
+ sfence
+ movups %xmm11, 16(%rcx)
+ movups %xmm10, 32(%rcx)
+ movups %xmm9, 48(%rcx)
+ movups %xmm8, 64(%rcx)
+ movups %xmm7, 80(%rcx)
+ ret
+
+
+ /* Each loop is > 64 bytes and <= 96 bytes; the 32-byte alignment
+ between them ensures 96-byte spacing between each. */
+#define ALIGNED_LARGE_LOOP_FWD(align_by); \
+ .p2align 5; \
+L(large_loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps 64(%rsi), %xmm4; \
+ movaps 80(%rsi), %xmm5; \
+ movaps %xmm5, %xmm6; \
+ palignr $align_by, %xmm4, %xmm5; \
+ palignr $align_by, %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm6, %xmm1; \
+ movntps %xmm0, 16(%rdi); \
+ movntps %xmm2, 32(%rdi); \
+ movntps %xmm3, 48(%rdi); \
+ movntps %xmm4, 64(%rdi); \
+ movntps %xmm5, 80(%rdi); \
+ addq $80, %rdi; \
+ addq $80, %rsi; \
+ cmpq %rdi, %rcx; \
+ ja L(large_loop_fwd_ ## align_by); \
+ jmp L(end_large_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LARGE_LOOP_FWD (0xf)
+ ALIGNED_LARGE_LOOP_FWD (0xe)
+ ALIGNED_LARGE_LOOP_FWD (0xd)
+ ALIGNED_LARGE_LOOP_FWD (0xc)
+ ALIGNED_LARGE_LOOP_FWD (0xb)
+ ALIGNED_LARGE_LOOP_FWD (0xa)
+ ALIGNED_LARGE_LOOP_FWD (0x9)
+ ALIGNED_LARGE_LOOP_FWD (0x8)
+ ALIGNED_LARGE_LOOP_FWD (0x7)
+ ALIGNED_LARGE_LOOP_FWD (0x6)
+ ALIGNED_LARGE_LOOP_FWD (0x5)
+ ALIGNED_LARGE_LOOP_FWD (0x4)
+ ALIGNED_LARGE_LOOP_FWD (0x3)
+ ALIGNED_LARGE_LOOP_FWD (0x2)
+ ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+ .p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+ movaps 32(%rsi), %xmm1
+ movaps 16(%rsi), %xmm2
+ movaps 0(%rsi), %xmm3
+ movaps %xmm1, 32(%rdi)
+ movaps %xmm2, 16(%rdi)
+ movaps %xmm3, 0(%rdi)
+ subq $48, %rdi
+ subq $48, %rsi
+ cmpq %rdi, %r8
+ jb L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+ movups %xmm7, -16(%r8, %rdx)
+ movups %xmm0, 0(%r8)
+ movups %xmm4, 16(%r8)
+ movups %xmm5, 32(%r8)
+
+ ret
+
+
+ /* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+ 60 bytes otherwise. */
+#define ALIGNED_LOOP_BKWD(align_by); \
+ .p2align 6; \
+L(loop_bkwd_ ## align_by): \
+ movaps 32(%rsi), %xmm1; \
+ movaps 16(%rsi), %xmm2; \
+ movaps 0(%rsi), %xmm3; \
+ palignr $align_by, %xmm1, %xmm6; \
+ palignr $align_by, %xmm2, %xmm1; \
+ palignr $align_by, %xmm3, %xmm2; \
+ movaps %xmm6, 32(%rdi); \
+ movaps %xmm1, 16(%rdi); \
+ movaps %xmm2, 0(%rdi); \
+ subq $48, %rdi; \
+ subq $48, %rsi; \
+ movaps %xmm3, %xmm6; \
+ cmpq %rdi, %r8; \
+ jb L(loop_bkwd_ ## align_by); \
+ jmp L(end_loop_bkwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_BKWD (0xf)
+ ALIGNED_LOOP_BKWD (0xe)
+ ALIGNED_LOOP_BKWD (0xd)
+ ALIGNED_LOOP_BKWD (0xc)
+ ALIGNED_LOOP_BKWD (0xb)
+ ALIGNED_LOOP_BKWD (0xa)
+ ALIGNED_LOOP_BKWD (0x9)
+ ALIGNED_LOOP_BKWD (0x8)
+ ALIGNED_LOOP_BKWD (0x7)
+ ALIGNED_LOOP_BKWD (0x6)
+ ALIGNED_LOOP_BKWD (0x5)
+ ALIGNED_LOOP_BKWD (0x4)
+ ALIGNED_LOOP_BKWD (0x3)
+ ALIGNED_LOOP_BKWD (0x2)
+ ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
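For reference, the small-size and overlap handling above comes down to
two tricks: load both 16-byte ends of the source before any store, and
pick the copy direction with a single unsigned comparison. Below is a
minimal C sketch of both, not glibc code; the helper names are
hypothetical and the copy helper assumes 16 <= len <= 32:

#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Copy len bytes with 16 <= len <= 32, overlap-safe: both 16-byte
   ends are loaded before either store, so one path covers every size
   in the range with no jump table, mirroring the movups pairs above.  */
static void
copy_16_to_32 (uint8_t *dst, const uint8_t *src, size_t len)
{
  uint8_t head[16], tail[16];
  memcpy (head, src, 16);
  memcpy (tail, src + len - 16, 16);
  memcpy (dst, head, 16);
  memcpy (dst + len - 16, tail, 16);
}

/* The L(copy_backward) test: a forward copy is safe unless dst lands
   inside [src, src + len).  The unsigned subtraction folds dst <= src
   and dst >= src + len into one comparison, like the subq/cmpq/jb
   sequence above.  */
static int
forward_copy_is_safe (const uint8_t *dst, const uint8_t *src, size_t len)
{
  return (uintptr_t) (dst - src) >= len;
}

int
main (void)
{
  uint8_t buf[48];
  for (int i = 0; i < 48; i++)
    buf[i] = (uint8_t) i;
  assert (!forward_copy_is_safe (buf + 8, buf, 24));
  copy_16_to_32 (buf + 8, buf, 24);	/* overlapping move */
  for (int i = 0; i < 24; i++)
    assert (buf[8 + i] == i);
  puts ("ok");
  return 0;
}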
--
2.25.1
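The L(large_memcpy) path in the memmove-ssse3 rewrite above switches to
movntps once the copy exceeds half the shared cache, then fences.
Roughly the same idea in SSE2 intrinsics, as a sketch under stated
assumptions: dst is 16-byte aligned, len is a multiple of 16, and
_mm_stream_si128 emits movntdq, the integer twin of movntps:

#include <emmintrin.h>	/* SSE2: _mm_stream_si128, _mm_loadu_si128 */
#include <stddef.h>

static void
stream_copy (void *dst, const void *src, size_t len)
{
  __m128i *d = (__m128i *) dst;		/* must be 16-byte aligned */
  const __m128i *s = (const __m128i *) src;
  for (size_t i = 0; i < len / 16; i++)
    /* Non-temporal store: bypass the cache so a huge copy does not
       evict the caller's working set.  */
    _mm_stream_si128 (d + i, _mm_loadu_si128 (s + i));
  /* Streaming stores are weakly ordered; sfence makes them visible
     before any later store, as in L(end_large_loop_fwd) above.  */
  _mm_sfence ();
}

Without the final sfence, a later ordinary store could become visible
before the streamed data, which is why the tail stores above come after
the fence.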
* [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (8 preceding siblings ...)
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
` (5 more replies)
9 siblings, 6 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
5 files changed, 2006 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
memcmp-evex-movbe \
memcmp-sse2 \
memcmp-sse4 \
- memcmp-ssse3 \
memcmpeq-avx2 \
memcmpeq-avx2-rtm \
memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
wmemcmp-c \
wmemcmp-evex-movbe \
wmemcmp-sse4 \
- wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
- __memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
#ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
- __wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
/* Support sysdeps/x86_64/multiarch/wmemset.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_ssse3
-# endif
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
- test %RDX_LP, %RDX_LP
- jz L(equal)
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- mov %rdx, %rcx
- mov %rdi, %rdx
- cmp $48, %rcx;
- jae L(48bytesormore) /* LEN => 48 */
-
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-/* ECX >= 32. */
-L(48bytesormore):
- movdqu (%rdi), %xmm3
- movdqu (%rsi), %xmm0
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %edx
- lea 16(%rdi), %rdi
- lea 16(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(less16bytes)
- mov %edi, %edx
- and $0xf, %edx
- xor %rdx, %rdi
- sub %rdx, %rsi
- add %rdx, %rcx
- mov %esi, %edx
- and $0xf, %edx
- jz L(shr_0)
- xor %rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
- cmp $8, %edx
- jae L(next_unaligned_table)
- cmp $0, %edx
- je L(shr_0)
- cmp $1, %edx
- je L(shr_1)
- cmp $2, %edx
- je L(shr_2)
- cmp $3, %edx
- je L(shr_3)
- cmp $4, %edx
- je L(shr_4)
- cmp $5, %edx
- je L(shr_5)
- cmp $6, %edx
- je L(shr_6)
- jmp L(shr_7)
-
- .p2align 2
-L(next_unaligned_table):
- cmp $8, %edx
- je L(shr_8)
- cmp $9, %edx
- je L(shr_9)
- cmp $10, %edx
- je L(shr_10)
- cmp $11, %edx
- je L(shr_11)
- cmp $12, %edx
- je L(shr_12)
- cmp $13, %edx
- je L(shr_13)
- cmp $14, %edx
- je L(shr_14)
- jmp L(shr_15)
-# else
- cmp $0, %edx
- je L(shr_0)
- cmp $4, %edx
- je L(shr_4)
- cmp $8, %edx
- je L(shr_8)
- jmp L(shr_12)
-# endif
-
- .p2align 4
-L(shr_0):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- jae L(shr_0_gobble)
- xor %eax, %eax
- movdqa (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
- pand %xmm1, %xmm2
- pmovmskb %xmm2, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_0_gobble):
- movdqa (%rsi), %xmm0
- xor %eax, %eax
- pcmpeqb (%rdi), %xmm0
- sub $32, %rcx
- movdqa 16(%rsi), %xmm2
- pcmpeqb 16(%rdi), %xmm2
-L(shr_0_gobble_loop):
- pand %xmm0, %xmm2
- sub $32, %rcx
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- movdqa 32(%rsi), %xmm0
- movdqa 48(%rsi), %xmm2
- sbb $0xffff, %edx
- pcmpeqb 32(%rdi), %xmm0
- pcmpeqb 48(%rdi), %xmm2
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- jz L(shr_0_gobble_loop)
-
- pand %xmm0, %xmm2
- cmp $0, %rcx
- jge L(next)
- inc %edx
- add $32, %rcx
-L(next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm2, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_1):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_1_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $1, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $1, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $1, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_1_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $1, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $1, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $1, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $1, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_1_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_1_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_1_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 1(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-
- .p2align 4
-L(shr_2):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_2_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $2, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $2, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $2, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_2_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $2, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $2, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $2, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $2, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_2_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_2_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_2_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 2(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_3_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $3, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $3, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $3, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_3_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $3, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $3, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $3, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $3, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_3_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_3_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_3_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 3(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_4):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_4_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $4, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $4, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $4, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_4_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $4, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $4, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $4, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $4, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_4_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_4_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_4_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 4(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_5):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_5_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $5, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $5, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $5, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_5_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $5, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $5, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $5, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $5, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_5_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_5_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_5_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 5(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_6_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $6, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $6, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $6, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_6_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $6, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $6, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $6, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $6, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_6_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_6_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_6_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 6(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_7_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $7, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $7, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $7, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_7_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $7, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $7, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $7, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $7, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_7_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_7_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_7_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 7(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_8):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_8_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $8, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $8, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $8, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_8_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $8, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $8, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $8, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $8, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_8_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_8_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_8_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 8(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_9):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_9_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $9, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $9, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $9, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_9_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $9, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $9, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $9, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $9, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_9_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_9_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_9_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 9(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_10_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $10, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $10, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $10, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_10_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $10, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $10, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $10, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $10, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_10_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_10_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_10_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 10(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_11_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $11, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $11, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $11, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_11_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $11, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $11, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $11, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $11, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_11_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_11_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_11_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 11(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# endif
-
- .p2align 4
-L(shr_12):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_12_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $12, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $12, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $12, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_12_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $12, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $12, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $12, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $12, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_12_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_12_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_12_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 12(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
- .p2align 4
-L(shr_13):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_13_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $13, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $13, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $13, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_13_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $13, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $13, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $13, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $13, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_13_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_13_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_13_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 13(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_14_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $14, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $14, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $14, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_14_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $14, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $14, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $14, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $14, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_14_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_14_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_14_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 14(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15):
- cmp $80, %rcx
- lea -48(%rcx), %rcx
- mov %edx, %eax
- jae L(shr_15_gobble)
-
- movdqa 16(%rsi), %xmm1
- movdqa %xmm1, %xmm2
- palignr $15, (%rsi), %xmm1
- pcmpeqb (%rdi), %xmm1
-
- movdqa 32(%rsi), %xmm3
- palignr $15, %xmm2, %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
- pand %xmm1, %xmm3
- pmovmskb %xmm3, %edx
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
- add $15, %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-
- .p2align 4
-L(shr_15_gobble):
- sub $32, %rcx
- movdqa 16(%rsi), %xmm0
- palignr $15, (%rsi), %xmm0
- pcmpeqb (%rdi), %xmm0
-
- movdqa 32(%rsi), %xmm3
- palignr $15, 16(%rsi), %xmm3
- pcmpeqb 16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
- pand %xmm0, %xmm3
- sub $32, %rcx
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
-
- movdqa 64(%rsi), %xmm3
- palignr $15, 48(%rsi), %xmm3
- sbb $0xffff, %edx
- movdqa 48(%rsi), %xmm0
- palignr $15, 32(%rsi), %xmm0
- pcmpeqb 32(%rdi), %xmm0
- lea 32(%rsi), %rsi
- pcmpeqb 48(%rdi), %xmm3
-
- lea 32(%rdi), %rdi
- jz L(shr_15_gobble_loop)
- pand %xmm0, %xmm3
-
- cmp $0, %rcx
- jge L(shr_15_gobble_next)
- inc %edx
- add $32, %rcx
-L(shr_15_gobble_next):
- test %edx, %edx
- jnz L(exit)
-
- pmovmskb %xmm3, %edx
- movdqa %xmm0, %xmm1
- lea 32(%rdi), %rdi
- lea 32(%rsi), %rsi
- sub $0xffff, %edx
- jnz L(exit)
-
- lea 15(%rsi), %rsi
- add %rcx, %rsi
- add %rcx, %rdi
- jmp L(less48bytes)
-# endif
- .p2align 4
-L(exit):
- pmovmskb %xmm1, %r8d
- sub $0xffff, %r8d
- jz L(first16bytes)
- lea -16(%rsi), %rsi
- lea -16(%rdi), %rdi
- mov %r8d, %edx
-L(first16bytes):
- add %rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
- test %dl, %dl
- jz L(next_24_bytes)
-
- test $0x01, %dl
- jnz L(Byte16)
-
- test $0x02, %dl
- jnz L(Byte17)
-
- test $0x04, %dl
- jnz L(Byte18)
-
- test $0x08, %dl
- jnz L(Byte19)
-
- test $0x10, %dl
- jnz L(Byte20)
-
- test $0x20, %dl
- jnz L(Byte21)
-
- test $0x40, %dl
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte16):
- movzbl -16(%rdi), %eax
- movzbl -16(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte17):
- movzbl -15(%rdi), %eax
- movzbl -15(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte18):
- movzbl -14(%rdi), %eax
- movzbl -14(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte19):
- movzbl -13(%rdi), %eax
- movzbl -13(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte20):
- movzbl -12(%rdi), %eax
- movzbl -12(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte21):
- movzbl -11(%rdi), %eax
- movzbl -11(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(Byte22):
- movzbl -10(%rdi), %eax
- movzbl -10(%rsi), %edx
- sub %edx, %eax
- ret
-
- .p2align 4
-L(next_24_bytes):
- lea 8(%rdi), %rdi
- lea 8(%rsi), %rsi
- test $0x01, %dh
- jnz L(Byte16)
-
- test $0x02, %dh
- jnz L(Byte17)
-
- test $0x04, %dh
- jnz L(Byte18)
-
- test $0x08, %dh
- jnz L(Byte19)
-
- test $0x10, %dh
- jnz L(Byte20)
-
- test $0x20, %dh
- jnz L(Byte21)
-
- test $0x40, %dh
- jnz L(Byte22)
-
- movzbl -9(%rdi), %eax
- movzbl -9(%rsi), %edx
- sub %edx, %eax
- ret
-# else
-/* special for wmemcmp */
- xor %eax, %eax
- test %dl, %dl
- jz L(next_two_double_words)
- and $15, %dl
- jz L(second_double_word)
- mov -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(second_double_word):
- mov -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(next_two_double_words):
- and $15, %dh
- jz L(fourth_double_word)
- mov -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
- ret
-
- .p2align 4
-L(fourth_double_word):
- mov -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
- ret
-# endif
-
- .p2align 4
-L(less48bytes):
- cmp $8, %ecx
- jae L(more8bytes)
- cmp $0, %ecx
- je L(0bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $1, %ecx
- je L(1bytes)
- cmp $2, %ecx
- je L(2bytes)
- cmp $3, %ecx
- je L(3bytes)
- cmp $4, %ecx
- je L(4bytes)
- cmp $5, %ecx
- je L(5bytes)
- cmp $6, %ecx
- je L(6bytes)
- jmp L(7bytes)
-# else
- jmp L(4bytes)
-# endif
-
- .p2align 4
-L(more8bytes):
- cmp $16, %ecx
- jae L(more16bytes)
- cmp $8, %ecx
- je L(8bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $9, %ecx
- je L(9bytes)
- cmp $10, %ecx
- je L(10bytes)
- cmp $11, %ecx
- je L(11bytes)
- cmp $12, %ecx
- je L(12bytes)
- cmp $13, %ecx
- je L(13bytes)
- cmp $14, %ecx
- je L(14bytes)
- jmp L(15bytes)
-# else
- jmp L(12bytes)
-# endif
-
- .p2align 4
-L(more16bytes):
- cmp $24, %ecx
- jae L(more24bytes)
- cmp $16, %ecx
- je L(16bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $17, %ecx
- je L(17bytes)
- cmp $18, %ecx
- je L(18bytes)
- cmp $19, %ecx
- je L(19bytes)
- cmp $20, %ecx
- je L(20bytes)
- cmp $21, %ecx
- je L(21bytes)
- cmp $22, %ecx
- je L(22bytes)
- jmp L(23bytes)
-# else
- jmp L(20bytes)
-# endif
-
- .p2align 4
-L(more24bytes):
- cmp $32, %ecx
- jae L(more32bytes)
- cmp $24, %ecx
- je L(24bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $25, %ecx
- je L(25bytes)
- cmp $26, %ecx
- je L(26bytes)
- cmp $27, %ecx
- je L(27bytes)
- cmp $28, %ecx
- je L(28bytes)
- cmp $29, %ecx
- je L(29bytes)
- cmp $30, %ecx
- je L(30bytes)
- jmp L(31bytes)
-# else
- jmp L(28bytes)
-# endif
-
- .p2align 4
-L(more32bytes):
- cmp $40, %ecx
- jae L(more40bytes)
- cmp $32, %ecx
- je L(32bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $33, %ecx
- je L(33bytes)
- cmp $34, %ecx
- je L(34bytes)
- cmp $35, %ecx
- je L(35bytes)
- cmp $36, %ecx
- je L(36bytes)
- cmp $37, %ecx
- je L(37bytes)
- cmp $38, %ecx
- je L(38bytes)
- jmp L(39bytes)
-# else
- jmp L(36bytes)
-# endif
-
- .p2align 4
-L(more40bytes):
- cmp $40, %ecx
- je L(40bytes)
-# ifndef USE_AS_WMEMCMP
- cmp $41, %ecx
- je L(41bytes)
- cmp $42, %ecx
- je L(42bytes)
- cmp $43, %ecx
- je L(43bytes)
- cmp $44, %ecx
- je L(44bytes)
- cmp $45, %ecx
- je L(45bytes)
- cmp $46, %ecx
- je L(46bytes)
- jmp L(47bytes)
-
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- movl -44(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- movl -40(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- movl -36(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- movl -32(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- movl -28(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- movl -24(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- movl -20(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- movl -16(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- movl -12(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- movl -8(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- movl -4(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# else
- .p2align 4
-L(44bytes):
- movl -44(%rdi), %eax
- cmp -44(%rsi), %eax
- jne L(find_diff)
-L(40bytes):
- movl -40(%rdi), %eax
- cmp -40(%rsi), %eax
- jne L(find_diff)
-L(36bytes):
- movl -36(%rdi), %eax
- cmp -36(%rsi), %eax
- jne L(find_diff)
-L(32bytes):
- movl -32(%rdi), %eax
- cmp -32(%rsi), %eax
- jne L(find_diff)
-L(28bytes):
- movl -28(%rdi), %eax
- cmp -28(%rsi), %eax
- jne L(find_diff)
-L(24bytes):
- movl -24(%rdi), %eax
- cmp -24(%rsi), %eax
- jne L(find_diff)
-L(20bytes):
- movl -20(%rdi), %eax
- cmp -20(%rsi), %eax
- jne L(find_diff)
-L(16bytes):
- movl -16(%rdi), %eax
- cmp -16(%rsi), %eax
- jne L(find_diff)
-L(12bytes):
- movl -12(%rdi), %eax
- cmp -12(%rsi), %eax
- jne L(find_diff)
-L(8bytes):
- movl -8(%rdi), %eax
- cmp -8(%rsi), %eax
- jne L(find_diff)
-L(4bytes):
- movl -4(%rdi), %eax
- cmp -4(%rsi), %eax
- jne L(find_diff)
-L(0bytes):
- xor %eax, %eax
- ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
- .p2align 4
-L(45bytes):
- movl -45(%rdi), %eax
- movl -45(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(41bytes):
- movl -41(%rdi), %eax
- movl -41(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(37bytes):
- movl -37(%rdi), %eax
- movl -37(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(33bytes):
- movl -33(%rdi), %eax
- movl -33(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(29bytes):
- movl -29(%rdi), %eax
- movl -29(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(25bytes):
- movl -25(%rdi), %eax
- movl -25(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(21bytes):
- movl -21(%rdi), %eax
- movl -21(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(17bytes):
- movl -17(%rdi), %eax
- movl -17(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(13bytes):
- movl -13(%rdi), %eax
- movl -13(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(9bytes):
- movl -9(%rdi), %eax
- movl -9(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(5bytes):
- movl -5(%rdi), %eax
- movl -5(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(1bytes):
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(46bytes):
- movl -46(%rdi), %eax
- movl -46(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(42bytes):
- movl -42(%rdi), %eax
- movl -42(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(38bytes):
- movl -38(%rdi), %eax
- movl -38(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(34bytes):
- movl -34(%rdi), %eax
- movl -34(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(30bytes):
- movl -30(%rdi), %eax
- movl -30(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(26bytes):
- movl -26(%rdi), %eax
- movl -26(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(22bytes):
- movl -22(%rdi), %eax
- movl -22(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(18bytes):
- movl -18(%rdi), %eax
- movl -18(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(14bytes):
- movl -14(%rdi), %eax
- movl -14(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(10bytes):
- movl -10(%rdi), %eax
- movl -10(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(6bytes):
- movl -6(%rdi), %eax
- movl -6(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(2bytes):
- movzwl -2(%rdi), %eax
- movzwl -2(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(47bytes):
- movl -47(%rdi), %eax
- movl -47(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(43bytes):
- movl -43(%rdi), %eax
- movl -43(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(39bytes):
- movl -39(%rdi), %eax
- movl -39(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(35bytes):
- movl -35(%rdi), %eax
- movl -35(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(31bytes):
- movl -31(%rdi), %eax
- movl -31(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(27bytes):
- movl -27(%rdi), %eax
- movl -27(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(23bytes):
- movl -23(%rdi), %eax
- movl -23(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(19bytes):
- movl -19(%rdi), %eax
- movl -19(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(15bytes):
- movl -15(%rdi), %eax
- movl -15(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(11bytes):
- movl -11(%rdi), %eax
- movl -11(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(7bytes):
- movl -7(%rdi), %eax
- movl -7(%rsi), %ecx
- cmp %ecx, %eax
- jne L(find_diff)
-L(3bytes):
- movzwl -3(%rdi), %eax
- movzwl -3(%rsi), %ecx
- cmpb %cl, %al
- jne L(set)
- cmp %ecx, %eax
- jne L(set)
- movzbl -1(%rdi), %eax
- cmpb -1(%rsi), %al
- jne L(set)
- xor %eax, %eax
- ret
-
- .p2align 4
-L(find_diff):
- cmpb %cl, %al
- jne L(set)
- cmpw %cx, %ax
- jne L(set)
- shr $16, %eax
- shr $16, %ecx
- cmpb %cl, %al
- jne L(set)
-
-/* We get there only if we already know there is a
-difference. */
-
- cmp %ecx, %eax
-L(set):
- sbb %eax, %eax
- sbb $-1, %eax
- ret
-# else
-
-/* for wmemcmp */
- .p2align 4
-L(find_diff):
- mov $1, %eax
- jg L(find_diff_bigger)
- neg %eax
- ret
-
- .p2align 4
-L(find_diff_bigger):
- ret
-# endif
-
- .p2align 4
-L(equal):
- xor %eax, %eax
- ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
--
2.25.1
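The deleted file's whole reason for existing was palignr: align one
pointer, then rebuild the other operand from two aligned loads shifted
by the relative byte offset. palignr only takes an immediate shift,
which is why there is one shr_N block per offset. One iteration of that
idea in intrinsics, for a fixed shift of 1; the helper name is
hypothetical, both pointers are assumed 16-byte aligned, and it needs
-mssse3 to build:

#include <tmmintrin.h>	/* SSSE3: _mm_alignr_epi8 (palignr) */

/* Compare 16 aligned bytes at a against the 16 bytes starting one
   byte past b; b itself is 16-byte aligned.  Returns nonzero if they
   are equal.  */
static int
equal16_shift1 (const void *a, const void *b)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) b);
  __m128i hi = _mm_load_si128 ((const __m128i *) b + 1);
  /* Concatenate hi:lo and shift right one byte -- the 16 wanted
     bytes, without an unaligned load.  */
  __m128i v = _mm_alignr_epi8 (hi, lo, 1);
  __m128i eq = _mm_cmpeq_epi8 (_mm_load_si128 ((const __m128i *) a), v);
  /* pmovmskb == 0xffff means all 16 bytes matched, hence the
     sub $0xffff, %edx / jnz pattern in the deleted code.  */
  return _mm_movemask_epi8 (eq) == 0xffff;
}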
* [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 18:05 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
` (4 subsequent siblings)
5 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given the code size cost.
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
sysdeps/x86_64/multiarch/strcmp.c | 4 -
sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
sysdeps/x86_64/multiarch/strncmp.c | 4 -
sysdeps/x86_64/strcmp.S | 155 ++++--------------
10 files changed, 30 insertions(+), 202 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 51222dfab1..ed2def288d 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -58,7 +58,6 @@ sysdep_routines += \
strcasecmp_l-evex \
strcasecmp_l-sse2 \
strcasecmp_l-sse4_2 \
- strcasecmp_l-ssse3 \
strcat-avx2 \
strcat-avx2-rtm \
strcat-evex \
@@ -80,7 +79,6 @@ sysdep_routines += \
strcmp-sse2 \
strcmp-sse2-unaligned \
strcmp-sse4_2 \
- strcmp-ssse3 \
strcpy-avx2 \
strcpy-avx2-rtm \
strcpy-evex \
@@ -98,7 +96,6 @@ sysdep_routines += \
strncase_l-evex \
strncase_l-sse2 \
strncase_l-sse4_2 \
- strncase_l-ssse3 \
strncat-avx2 \
strncat-avx2-rtm \
strncat-c \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncmp-evex \
strncmp-sse2 \
strncmp-sse4_2 \
- strncmp-ssse3 \
strncpy-avx2 \
strncpy-avx2-rtm \
strncpy-c \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f389928a4e..7e2be3554b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
/* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
@@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strcasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strcasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strcasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strcasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
__strcasecmp_l_sse2))
@@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strcmp_evex)
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
__strcmp_sse42)
- IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
- __strcmp_ssse3)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
@@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
__strncasecmp_sse2))
@@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, strncasecmp_l,
CPU_FEATURE_USABLE (SSE4_2),
__strncasecmp_l_sse42)
- IFUNC_IMPL_ADD (array, i, strncasecmp_l,
- CPU_FEATURE_USABLE (SSSE3),
- __strncasecmp_l_ssse3)
IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
__strncasecmp_l_sse2))
@@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strncmp_evex)
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
__strncmp_sse42)
- IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
- __strncmp_ssse3)
IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
index 766539c241..296d32071b 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
@@ -20,7 +20,6 @@
#include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
deleted file mode 100644
index fb2f9ae14a..0000000000
--- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strcasecmp_l_ssse3
-#define __strcasecmp __strcasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
deleted file mode 100644
index 1b7fa33c91..0000000000
--- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
+++ /dev/null
@@ -1,5 +0,0 @@
-#if IS_IN (libc)
-# define USE_SSSE3 1
-# define STRCMP __strcmp_ssse3
-# include "../strcmp.S"
-#endif
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 68cb73baad..a248c2a6e6 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -28,7 +28,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
deleted file mode 100644
index 6728678688..0000000000
--- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
+++ /dev/null
@@ -1,6 +0,0 @@
-#define USE_SSSE3 1
-#define USE_AS_STRNCASECMP_L
-#define NO_NOLOCALE_ALIAS
-#define STRCMP __strncasecmp_l_ssse3
-#define __strncasecmp __strncasecmp_ssse3
-#include "../strcmp.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
deleted file mode 100644
index ec37308347..0000000000
--- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
+++ /dev/null
@@ -1,28 +0,0 @@
-/* strcmp optimized with SSSE3.
- Copyright (C) 2017-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#define STRCMP __strncmp_ssse3
-
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(strcmp)
-
-#define USE_SSSE3 1
-#define USE_AS_STRNCMP
-#include <sysdeps/x86_64/strcmp.S>
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
index fca74199d8..70ae6547c9 100644
--- a/sysdeps/x86_64/multiarch/strncmp.c
+++ b/sysdeps/x86_64/multiarch/strncmp.c
@@ -27,7 +27,6 @@
# include <init-arch.h>
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
@@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
return OPTIMIZE (sse42);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
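In the strcmp.S hunks below, the SSSE3 palignr path is dropped and the
plain SSE2 merge (psrldq/pslldq/por) becomes unconditional; the two
sequences produce identical bytes. A quick intrinsics check of that
equivalence, as a standalone sketch rather than glibc code (build with
-mssse3):

#include <assert.h>
#include <emmintrin.h>		/* SSE2: psrldq/pslldq/por */
#include <stdio.h>
#include <tmmintrin.h>		/* SSSE3: palignr */

int
main (void)
{
  __m128i x2 = _mm_set_epi8 (15, 14, 13, 12, 11, 10, 9, 8,
			     7, 6, 5, 4, 3, 2, 1, 0);
  __m128i x3 = _mm_set_epi8 (31, 30, 29, 28, 27, 26, 25, 24,
			     23, 22, 21, 20, 19, 18, 17, 16);
  /* palignr $1, %xmm3, %xmm2: concatenate x2:x3, shift right 1 byte.  */
  __m128i with_ssse3 = _mm_alignr_epi8 (x2, x3, 1);
  /* psrldq $1, %xmm3; pslldq $15, %xmm2; por %xmm3, %xmm2.  */
  __m128i with_sse2 = _mm_or_si128 (_mm_srli_si128 (x3, 1),
				    _mm_slli_si128 (x2, 15));
  assert (_mm_movemask_epi8 (_mm_cmpeq_epi8 (with_ssse3,
					     with_sse2)) == 0xffff);
  puts ("equivalent");
  return 0;
}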
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index 99d8b36f1d..c38dc627f9 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -59,12 +59,7 @@
# endif
#endif
-#ifndef USE_SSSE3
.text
-#else
- .section .text.ssse3,"ax",@progbits
-#endif
-
#ifdef USE_AS_STRCASECMP_L
# ifndef ENTRY2
# define ENTRY2(name) ENTRY (name)
@@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4 /* store for next cycle */
-#ifndef USE_SSSE3
psrldq $1, %xmm3
pslldq $15, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $2, %xmm3
pslldq $14, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $3, %xmm3
pslldq $13, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $4, %xmm3
pslldq $12, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $5, %xmm3
pslldq $11, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $6, %xmm3
pslldq $10, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $7, %xmm3
pslldq $9, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $8, %xmm3
pslldq $8, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $9, %xmm3
pslldq $7, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $10, %xmm3
pslldq $6, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $11, %xmm3
pslldq $5, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $12, %xmm3
pslldq $4, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $13, %xmm3
pslldq $3, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $14, %xmm3
pslldq $2, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
@@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
movdqa (%rdi, %rcx), %xmm2
movdqa %xmm2, %xmm4
-#ifndef USE_SSSE3
psrldq $15, %xmm3
pslldq $1, %xmm2
por %xmm3, %xmm2 /* merge into one 16byte value */
-#else
- palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
-#endif
+
TOLOWER (%xmm1, %xmm2)
pcmpeqb %xmm1, %xmm0
--
2.25.1
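Every strcmp.S hunk in the patch above makes the same substitution: the SSSE3-only palignr that merged two 16-byte loads is removed, leaving the plain-SSE2 psrldq/pslldq/por triple that used to be the #ifndef USE_SSSE3 fallback. The two forms are bit-for-bit equivalent. The sketch below checks this in C intrinsics (compile with -mssse3; the ALIGNR_SSE2 macro is invented for this sketch, not part of any header).

#include <emmintrin.h>   /* SSE2: _mm_srli_si128, _mm_slli_si128, _mm_or_si128.  */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8.  */
#include <stdio.h>
#include <string.h>

/* SSE2 emulation of AT&T "palignr $N, lo, hi": the low 16 bytes of
   the 32-byte concatenation hi:lo shifted right by N bytes.  */
#define ALIGNR_SSE2(hi, lo, N) \
  _mm_or_si128 (_mm_srli_si128 ((lo), (N)), _mm_slli_si128 ((hi), 16 - (N)))

int
main (void)
{
  unsigned char buf[32], a[16], b[16];
  for (int i = 0; i < 32; i++)
    buf[i] = i;
  __m128i lo = _mm_loadu_si128 ((const __m128i *) buf);
  __m128i hi = _mm_loadu_si128 ((const __m128i *) (buf + 16));

  _mm_storeu_si128 ((__m128i *) a, _mm_alignr_epi8 (hi, lo, 5));  /* SSSE3.  */
  _mm_storeu_si128 ((__m128i *) b, ALIGNR_SSE2 (hi, lo, 5));      /* SSE2.  */

  puts (memcmp (a, b, 16) == 0 ? "equivalent" : "MISMATCH");
  return 0;
}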
* [PATCH v5 3/6] x86: Remove str{n}cat-ssse3
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 18:06 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
` (3 subsequent siblings)
5 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth keeping the SSSE3
versions given the code size cost.
---
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
5 files changed, 879 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index ed2def288d..2b3c625ea2 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -63,7 +63,6 @@ sysdep_routines += \
strcat-evex \
strcat-sse2 \
strcat-sse2-unaligned \
- strcat-ssse3 \
strchr-avx2 \
strchr-avx2-rtm \
strchr-evex \
@@ -101,7 +100,6 @@ sysdep_routines += \
strncat-c \
strncat-evex \
strncat-sse2-unaligned \
- strncat-ssse3 \
strncmp-avx2 \
strncmp-avx2-rtm \
strncmp-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7e2be3554b..41a04621ad 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -481,8 +481,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcat_evex)
- IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
- __strcat_ssse3)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
@@ -630,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncat_evex)
- IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
- __strncat_ssse3)
IFUNC_IMPL_ADD (array, i, strncat, 1,
__strncat_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
index 5bece38f78..a15afa44e9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
+++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
@@ -23,7 +23,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);
- if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
- return OPTIMIZE (ssse3);
-
return OPTIMIZE (sse2);
}
diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
deleted file mode 100644
index 9f39e4fcd1..0000000000
--- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
+++ /dev/null
@@ -1,866 +0,0 @@
-/* strcat with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef STRCAT
-# define STRCAT __strcat_ssse3
-# endif
-
-# define USE_AS_STRCAT
-
-.text
-ENTRY (STRCAT)
-# ifdef USE_AS_STRNCAT
- mov %rdx, %r8
-# endif
-
-
-/* Inline corresponding strlen file, temporary until new strcpy
- implementation gets merged. */
-
- xor %eax, %eax
- cmpb $0, (%rdi)
- jz L(exit_tail0)
- cmpb $0, 1(%rdi)
- jz L(exit_tail1)
- cmpb $0, 2(%rdi)
- jz L(exit_tail2)
- cmpb $0, 3(%rdi)
- jz L(exit_tail3)
-
- cmpb $0, 4(%rdi)
- jz L(exit_tail4)
- cmpb $0, 5(%rdi)
- jz L(exit_tail5)
- cmpb $0, 6(%rdi)
- jz L(exit_tail6)
- cmpb $0, 7(%rdi)
- jz L(exit_tail7)
-
- cmpb $0, 8(%rdi)
- jz L(exit_tail8)
- cmpb $0, 9(%rdi)
- jz L(exit_tail9)
- cmpb $0, 10(%rdi)
- jz L(exit_tail10)
- cmpb $0, 11(%rdi)
- jz L(exit_tail11)
-
- cmpb $0, 12(%rdi)
- jz L(exit_tail12)
- cmpb $0, 13(%rdi)
- jz L(exit_tail13)
- cmpb $0, 14(%rdi)
- jz L(exit_tail14)
- cmpb $0, 15(%rdi)
- jz L(exit_tail15)
- pxor %xmm0, %xmm0
- lea 16(%rdi), %rcx
- lea 16(%rdi), %rax
- and $-16, %rax
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- pxor %xmm1, %xmm1
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- pxor %xmm2, %xmm2
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- pxor %xmm3, %xmm3
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm0
- pmovmskb %xmm0, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm1
- pmovmskb %xmm1, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm2
- pmovmskb %xmm2, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- pcmpeqb (%rax), %xmm3
- pmovmskb %xmm3, %edx
- test %edx, %edx
- lea 16(%rax), %rax
- jnz L(exit)
-
- and $-0x40, %rax
-
- .p2align 4
-L(aligned_64):
- pcmpeqb (%rax), %xmm0
- pcmpeqb 16(%rax), %xmm1
- pcmpeqb 32(%rax), %xmm2
- pcmpeqb 48(%rax), %xmm3
- pmovmskb %xmm0, %edx
- pmovmskb %xmm1, %r11d
- pmovmskb %xmm2, %r10d
- pmovmskb %xmm3, %r9d
- or %edx, %r9d
- or %r11d, %r9d
- or %r10d, %r9d
- lea 64(%rax), %rax
- jz L(aligned_64)
-
- test %edx, %edx
- jnz L(aligned_64_exit_16)
- test %r11d, %r11d
- jnz L(aligned_64_exit_32)
- test %r10d, %r10d
- jnz L(aligned_64_exit_48)
-
-L(aligned_64_exit_64):
- pmovmskb %xmm3, %edx
- jmp L(exit)
-
-L(aligned_64_exit_48):
- lea -16(%rax), %rax
- mov %r10d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_32):
- lea -32(%rax), %rax
- mov %r11d, %edx
- jmp L(exit)
-
-L(aligned_64_exit_16):
- lea -48(%rax), %rax
-
-L(exit):
- sub %rcx, %rax
- test %dl, %dl
- jz L(exit_high)
- test $0x01, %dl
- jnz L(exit_tail0)
-
- test $0x02, %dl
- jnz L(exit_tail1)
-
- test $0x04, %dl
- jnz L(exit_tail2)
-
- test $0x08, %dl
- jnz L(exit_tail3)
-
- test $0x10, %dl
- jnz L(exit_tail4)
-
- test $0x20, %dl
- jnz L(exit_tail5)
-
- test $0x40, %dl
- jnz L(exit_tail6)
- add $7, %eax
-L(exit_tail0):
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_high):
- add $8, %eax
- test $0x01, %dh
- jnz L(exit_tail0)
-
- test $0x02, %dh
- jnz L(exit_tail1)
-
- test $0x04, %dh
- jnz L(exit_tail2)
-
- test $0x08, %dh
- jnz L(exit_tail3)
-
- test $0x10, %dh
- jnz L(exit_tail4)
-
- test $0x20, %dh
- jnz L(exit_tail5)
-
- test $0x40, %dh
- jnz L(exit_tail6)
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail1):
- add $1, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail2):
- add $2, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail3):
- add $3, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail4):
- add $4, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail5):
- add $5, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail6):
- add $6, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail7):
- add $7, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail8):
- add $8, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail9):
- add $9, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail10):
- add $10, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail11):
- add $11, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail12):
- add $12, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail13):
- add $13, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail14):
- add $14, %eax
- jmp L(StartStrcpyPart)
-
- .p2align 4
-L(exit_tail15):
- add $15, %eax
-
- .p2align 4
-L(StartStrcpyPart):
- mov %rsi, %rcx
- lea (%rdi, %rax), %rdx
-# ifdef USE_AS_STRNCAT
- test %r8, %r8
- jz L(StrncatExit0)
- cmp $8, %r8
- jbe L(StrncatExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- jb L(StrncatExit15Bytes)
-# endif
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# ifdef USE_AS_STRNCAT
- cmp $16, %r8
- je L(StrncatExit16)
-# define USE_AS_STRNCPY
-# endif
-
-# include "strcpy-ssse3.S"
-
- .p2align 4
-L(CopyFrom1To16Bytes):
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit1):
- xor %ah, %ah
- movb %ah, 1(%rdx)
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit2):
- xor %ah, %ah
- movb %ah, 2(%rdx)
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit3):
- xor %ah, %ah
- movb %ah, 3(%rdx)
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit4):
- xor %ah, %ah
- movb %ah, 4(%rdx)
-L(Exit4):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit5):
- xor %ah, %ah
- movb %ah, 5(%rdx)
-L(Exit5):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit6):
- xor %ah, %ah
- movb %ah, 6(%rdx)
-L(Exit6):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit7):
- xor %ah, %ah
- movb %ah, 7(%rdx)
-L(Exit7):
- mov (%rcx), %eax
- mov %eax, (%rdx)
- mov 3(%rcx), %eax
- mov %eax, 3(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8):
- xor %ah, %ah
- movb %ah, 8(%rdx)
-L(Exit8):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit9):
- xor %ah, %ah
- movb %ah, 9(%rdx)
-L(Exit9):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movb 8(%rcx), %al
- movb %al, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit10):
- xor %ah, %ah
- movb %ah, 10(%rdx)
-L(Exit10):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movw 8(%rcx), %ax
- movw %ax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit11):
- xor %ah, %ah
- movb %ah, 11(%rdx)
-L(Exit11):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit12):
- xor %ah, %ah
- movb %ah, 12(%rdx)
-L(Exit12):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit13):
- xor %ah, %ah
- movb %ah, 13(%rdx)
-L(Exit13):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 5(%rcx), %xmm1
- movlpd %xmm1, 5(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit14):
- xor %ah, %ah
- movb %ah, 14(%rdx)
-L(Exit14):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 6(%rcx), %xmm1
- movlpd %xmm1, 6(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15):
- xor %ah, %ah
- movb %ah, 15(%rdx)
-L(Exit15):
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit16):
- xor %ah, %ah
- movb %ah, 16(%rdx)
-L(Exit16):
- movlpd (%rcx), %xmm0
- movlpd 8(%rcx), %xmm1
- movlpd %xmm0, (%rdx)
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- test $0x01, %al
- jnz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- test $0x02, %al
- jnz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- test $0x04, %al
- jnz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- test $0x08, %al
- jnz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- test $0x10, %al
- jnz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- test $0x20, %al
- jnz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- test $0x40, %al
- jnz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase2):
- test $0x01, %ah
- jnz L(Exit9)
- cmp $9, %r8
- je L(StrncatExit9)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- test $0x40, %ah
- jnz L(Exit15)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- mov %rdi, %rax
- ret
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $8, %r8
- ja L(ExitHighCase3)
- cmp $1, %r8
- je L(StrncatExit1)
- cmp $2, %r8
- je L(StrncatExit2)
- cmp $3, %r8
- je L(StrncatExit3)
- cmp $4, %r8
- je L(StrncatExit4)
- cmp $5, %r8
- je L(StrncatExit5)
- cmp $6, %r8
- je L(StrncatExit6)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- xor %ah, %ah
- movb %ah, 8(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(ExitHighCase3):
- cmp $9, %r8
- je L(StrncatExit9)
- cmp $10, %r8
- je L(StrncatExit10)
- cmp $11, %r8
- je L(StrncatExit11)
- cmp $12, %r8
- je L(StrncatExit12)
- cmp $13, %r8
- je L(StrncatExit13)
- cmp $14, %r8
- je L(StrncatExit14)
- cmp $15, %r8
- je L(StrncatExit15)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 8(%rcx), %xmm1
- movlpd %xmm1, 8(%rdx)
- xor %ah, %ah
- movb %ah, 16(%rdx)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit0):
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit15Bytes):
- cmp $9, %r8
- je L(StrncatExit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $10, %r8
- je L(StrncatExit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $11, %r8
- je L(StrncatExit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $12, %r8
- je L(StrncatExit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $13, %r8
- je L(StrncatExit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmp $14, %r8
- je L(StrncatExit14)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- movlpd 7(%rcx), %xmm1
- movlpd %xmm1, 7(%rdx)
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
- .p2align 4
-L(StrncatExit8Bytes):
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $1, %r8
- je L(StrncatExit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $2, %r8
- je L(StrncatExit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $3, %r8
- je L(StrncatExit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $4, %r8
- je L(StrncatExit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $5, %r8
- je L(StrncatExit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $6, %r8
- je L(StrncatExit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmp $7, %r8
- je L(StrncatExit7)
- movlpd (%rcx), %xmm0
- movlpd %xmm0, (%rdx)
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
- xor %cl, %cl
- movb %cl, (%rax)
- mov %rdi, %rax
- ret
-
-# endif
-END (STRCAT)
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
deleted file mode 100644
index 6c45ff3ec7..0000000000
--- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCAT
-#define STRCAT __strncat_ssse3
-#include "strcat-ssse3.S"
--
2.25.1
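Most of the strcat-ssse3.S body deleted above is a hand-unrolled strlen prologue plus a 64-byte-at-a-time scan. Its core gadget, compare 16 bytes against zero with pcmpeqb, compress the result to a bitmask with pmovmskb, then index the lowest set bit, is easier to see in intrinsics. The sketch below assumes SSE2 and GCC builtins; the pointer must be 16-byte aligned, which the deleted code arranged before entering its loops.

#include <emmintrin.h>
#include <stdio.h>

/* Return the index of the first NUL in the 16 bytes at P, or -1.
   P must be 16-byte aligned so the load cannot cross into an unmapped
   page; the deleted code aligned %rax for the same reason.  */
static int
first_nul_in_16 (const char *p)
{
  __m128i v = _mm_load_si128 ((const __m128i *) p);
  unsigned mask
    = _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, _mm_setzero_si128 ()));
  return mask ? __builtin_ctz (mask) : -1;   /* Bit i set => p[i] == 0.  */
}

int
main (void)
{
  static char buf[16] __attribute__ ((aligned (16))) = "hello, world";
  printf ("NUL at byte %d\n", first_nul_in_16 (buf));   /* Prints 12.  */
  return 0;
}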
* [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 18:10 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
` (2 subsequent siblings)
5 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result it is no longer worth keeping the SSSE3
versions given the code size cost.
---
---
sysdeps/x86_64/multiarch/Makefile | 4 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
6 files changed, 3572 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 2b3c625ea2..5b02ec8de5 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -46,13 +46,11 @@ sysdep_routines += \
stpcpy-evex \
stpcpy-sse2 \
stpcpy-sse2-unaligned \
- stpcpy-ssse3 \
stpncpy-avx2 \
stpncpy-avx2-rtm \
stpncpy-c \
stpncpy-evex \
stpncpy-sse2-unaligned \
- stpncpy-ssse3 \
strcasecmp_l-avx2 \
strcasecmp_l-avx2-rtm \
strcasecmp_l-evex \
@@ -83,7 +81,6 @@ sysdep_routines += \
strcpy-evex \
strcpy-sse2 \
strcpy-sse2-unaligned \
- strcpy-ssse3 \
strcspn-c \
strcspn-sse2 \
strlen-avx2 \
@@ -110,7 +107,6 @@ sysdep_routines += \
strncpy-c \
strncpy-evex \
strncpy-sse2-unaligned \
- strncpy-ssse3 \
strnlen-avx2 \
strnlen-avx2-rtm \
strnlen-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 41a04621ad..49ce6860d0 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
IFUNC_IMPL (i, name, stpncpy,
- IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
- __stpncpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
__stpncpy_avx2)
IFUNC_IMPL_ADD (array, i, stpncpy,
@@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/stpcpy.c. */
IFUNC_IMPL (i, name, stpcpy,
- IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
- __stpcpy_ssse3)
IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
__stpcpy_avx2)
IFUNC_IMPL_ADD (array, i, stpcpy,
@@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strcpy_evex)
- IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
- __strcpy_ssse3)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
@@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)),
__strncpy_evex)
- IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
- __strncpy_ssse3)
IFUNC_IMPL_ADD (array, i, strncpy, 1,
__strncpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
deleted file mode 100644
index d971c2da38..0000000000
--- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STPCPY
-#define STRCPY __stpcpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
deleted file mode 100644
index 14ed16f6b5..0000000000
--- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_STPCPY
-#define USE_AS_STRNCPY
-#define STRCPY __stpncpy_ssse3
-#include "strcpy-ssse3.S"
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
deleted file mode 100644
index f617a535cf..0000000000
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ /dev/null
@@ -1,3550 +0,0 @@
-/* strcpy with SSSE3
- Copyright (C) 2011-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# ifndef USE_AS_STRCAT
-# include <sysdep.h>
-
-# ifndef STRCPY
-# define STRCPY __strcpy_ssse3
-# endif
-
- .section .text.ssse3,"ax",@progbits
-ENTRY (STRCPY)
-
- mov %rsi, %rcx
-# ifdef USE_AS_STRNCPY
- mov %RDX_LP, %R8_LP
-# endif
- mov %rdi, %rdx
-# ifdef USE_AS_STRNCPY
- test %R8_LP, %R8_LP
- jz L(Exit0)
- cmp $8, %R8_LP
- jbe L(StrncpyExit8Bytes)
-# endif
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- cmpb $0, 7(%rcx)
- jz L(Exit8)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- jb L(StrncpyExit15Bytes)
-# endif
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- cmpb $0, 14(%rcx)
- jz L(Exit15)
-# ifdef USE_AS_STRNCPY
- cmp $16, %r8
- je L(Exit16)
-# endif
- cmpb $0, 15(%rcx)
- jz L(Exit16)
-# endif
-
-# ifdef USE_AS_STRNCPY
- mov %rcx, %rsi
- sub $16, %r8
- and $0xf, %rsi
-
-/* add 16 bytes rcx_offset to r8 */
-
- add %rsi, %r8
-# endif
- lea 16(%rcx), %rsi
- and $-16, %rsi
- pxor %xmm0, %xmm0
- mov (%rcx), %r9
- mov %r9, (%rdx)
- pcmpeqb (%rsi), %xmm0
- mov 8(%rcx), %r9
- mov %r9, 8(%rdx)
-
-/* convert byte mask in xmm0 to bit mask */
-
- pmovmskb %xmm0, %rax
- sub %rcx, %rsi
-
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- mov %rdx, %rax
- lea 16(%rdx), %rdx
- and $-16, %rdx
- sub %rdx, %rax
-
-# ifdef USE_AS_STRNCPY
- add %rax, %rsi
- lea -1(%rsi), %rsi
- and $1<<31, %esi
- test %rsi, %rsi
- jnz L(ContinueCopy)
- lea 16(%r8), %r8
-
-L(ContinueCopy):
-# endif
- sub %rax, %rcx
- mov %rcx, %rax
- and $0xf, %rax
- mov $0, %rsi
-
-/* case: rcx_offset == rdx_offset */
-
- jz L(Align16Both)
-
- cmp $8, %rax
- jae L(ShlHigh8)
- cmp $1, %rax
- je L(Shl1)
- cmp $2, %rax
- je L(Shl2)
- cmp $3, %rax
- je L(Shl3)
- cmp $4, %rax
- je L(Shl4)
- cmp $5, %rax
- je L(Shl5)
- cmp $6, %rax
- je L(Shl6)
- jmp L(Shl7)
-
-L(ShlHigh8):
- je L(Shl8)
- cmp $9, %rax
- je L(Shl9)
- cmp $10, %rax
- je L(Shl10)
- cmp $11, %rax
- je L(Shl11)
- cmp $12, %rax
- je L(Shl12)
- cmp $13, %rax
- je L(Shl13)
- cmp $14, %rax
- je L(Shl14)
- jmp L(Shl15)
-
-L(Align16Both):
- movaps (%rcx), %xmm1
- movaps 16(%rcx), %xmm2
- movaps %xmm1, (%rdx)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm4
- movaps %xmm3, (%rdx, %rsi)
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm1
- movaps %xmm4, (%rdx, %rsi)
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm2
- movaps %xmm1, (%rdx, %rsi)
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps 16(%rcx, %rsi), %xmm3
- movaps %xmm2, (%rdx, %rsi)
- pcmpeqb %xmm3, %xmm0
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm3, (%rdx, %rsi)
- mov %rcx, %rax
- lea 16(%rcx, %rsi), %rcx
- and $-0x40, %rcx
- sub %rcx, %rax
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- lea 112(%r8, %rax), %r8
-# endif
- mov $-0x40, %rsi
-
- .p2align 4
-L(Aligned64Loop):
- movaps (%rcx), %xmm2
- movaps %xmm2, %xmm4
- movaps 16(%rcx), %xmm5
- movaps 32(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 48(%rcx), %xmm7
- pminub %xmm5, %xmm2
- pminub %xmm7, %xmm3
- pminub %xmm2, %xmm3
- pcmpeqb %xmm0, %xmm3
- pmovmskb %xmm3, %rax
- lea 64(%rdx), %rdx
- lea 64(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeaveCase2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Aligned64Leave)
- movaps %xmm4, -64(%rdx)
- movaps %xmm5, -48(%rdx)
- movaps %xmm6, -32(%rdx)
- movaps %xmm7, -16(%rdx)
- jmp L(Aligned64Loop)
-
-L(Aligned64Leave):
-# ifdef USE_AS_STRNCPY
- lea 48(%r8), %r8
-# endif
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- test %rax, %rax
- lea 16(%rsi), %rsi
- jnz L(CopyFrom1To16Bytes)
-
- movaps %xmm6, -32(%rdx)
- pcmpeqb %xmm7, %xmm0
-# ifdef USE_AS_STRNCPY
- lea -16(%r8), %r8
-# endif
- pmovmskb %xmm0, %rax
- lea 16(%rsi), %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl1):
- movaps -1(%rcx), %xmm1
- movaps 15(%rcx), %xmm2
-L(Shl1Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit1Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl1LoopExit)
-
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 31(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -15(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -1(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl1LoopStart):
- movaps 15(%rcx), %xmm2
- movaps 31(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 47(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 63(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $1, %xmm4, %xmm5
- test %rax, %rax
- palignr $1, %xmm3, %xmm4
- jnz L(Shl1Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave1)
-# endif
- palignr $1, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $1, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl1LoopStart)
-
-L(Shl1LoopExit):
- movdqu -1(%rcx), %xmm1
- mov $15, %rsi
- movdqu %xmm1, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl2):
- movaps -2(%rcx), %xmm1
- movaps 14(%rcx), %xmm2
-L(Shl2Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit2Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl2LoopExit)
-
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 30(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -14(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -2(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl2LoopStart):
- movaps 14(%rcx), %xmm2
- movaps 30(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 46(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 62(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $2, %xmm4, %xmm5
- test %rax, %rax
- palignr $2, %xmm3, %xmm4
- jnz L(Shl2Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave2)
-# endif
- palignr $2, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $2, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl2LoopStart)
-
-L(Shl2LoopExit):
- movdqu -2(%rcx), %xmm1
- mov $14, %rsi
- movdqu %xmm1, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl3):
- movaps -3(%rcx), %xmm1
- movaps 13(%rcx), %xmm2
-L(Shl3Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit3Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl3LoopExit)
-
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 29(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -13(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -3(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl3LoopStart):
- movaps 13(%rcx), %xmm2
- movaps 29(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 45(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 61(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $3, %xmm4, %xmm5
- test %rax, %rax
- palignr $3, %xmm3, %xmm4
- jnz L(Shl3Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave3)
-# endif
- palignr $3, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $3, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl3LoopStart)
-
-L(Shl3LoopExit):
- movdqu -3(%rcx), %xmm1
- mov $13, %rsi
- movdqu %xmm1, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl4):
- movaps -4(%rcx), %xmm1
- movaps 12(%rcx), %xmm2
-L(Shl4Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit4Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl4LoopExit)
-
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 28(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -12(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -4(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl4LoopStart):
- movaps 12(%rcx), %xmm2
- movaps 28(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 44(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 60(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $4, %xmm4, %xmm5
- test %rax, %rax
- palignr $4, %xmm3, %xmm4
- jnz L(Shl4Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave4)
-# endif
- palignr $4, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $4, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl4LoopStart)
-
-L(Shl4LoopExit):
- movdqu -4(%rcx), %xmm1
- mov $12, %rsi
- movdqu %xmm1, -4(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl5):
- movaps -5(%rcx), %xmm1
- movaps 11(%rcx), %xmm2
-L(Shl5Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit5Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl5LoopExit)
-
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 27(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -11(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -5(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl5LoopStart):
- movaps 11(%rcx), %xmm2
- movaps 27(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 43(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 59(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $5, %xmm4, %xmm5
- test %rax, %rax
- palignr $5, %xmm3, %xmm4
- jnz L(Shl5Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave5)
-# endif
- palignr $5, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $5, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl5LoopStart)
-
-L(Shl5LoopExit):
- movdqu -5(%rcx), %xmm1
- mov $11, %rsi
- movdqu %xmm1, -5(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl6):
- movaps -6(%rcx), %xmm1
- movaps 10(%rcx), %xmm2
-L(Shl6Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit6Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl6LoopExit)
-
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 26(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -10(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -6(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl6LoopStart):
- movaps 10(%rcx), %xmm2
- movaps 26(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 42(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 58(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $6, %xmm4, %xmm5
- test %rax, %rax
- palignr $6, %xmm3, %xmm4
- jnz L(Shl6Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave6)
-# endif
- palignr $6, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $6, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl6LoopStart)
-
-L(Shl6LoopExit):
- mov (%rcx), %r9
- mov 6(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 6(%rdx)
- mov $10, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl7):
- movaps -7(%rcx), %xmm1
- movaps 9(%rcx), %xmm2
-L(Shl7Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit7Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl7LoopExit)
-
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 25(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -9(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -7(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl7LoopStart):
- movaps 9(%rcx), %xmm2
- movaps 25(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 41(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 57(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $7, %xmm4, %xmm5
- test %rax, %rax
- palignr $7, %xmm3, %xmm4
- jnz L(Shl7Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave7)
-# endif
- palignr $7, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $7, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl7LoopStart)
-
-L(Shl7LoopExit):
- mov (%rcx), %r9
- mov 5(%rcx), %esi
- mov %r9, (%rdx)
- mov %esi, 5(%rdx)
- mov $9, %rsi
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl8):
- movaps -8(%rcx), %xmm1
- movaps 8(%rcx), %xmm2
-L(Shl8Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit8Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl8LoopExit)
-
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 24(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -8(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -8(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl8LoopStart):
- movaps 8(%rcx), %xmm2
- movaps 24(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 40(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 56(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $8, %xmm4, %xmm5
- test %rax, %rax
- palignr $8, %xmm3, %xmm4
- jnz L(Shl8Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave8)
-# endif
- palignr $8, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $8, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl8LoopStart)
-
-L(Shl8LoopExit):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl9):
- movaps -9(%rcx), %xmm1
- movaps 7(%rcx), %xmm2
-L(Shl9Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit9Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl9LoopExit)
-
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 23(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -7(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -9(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl9LoopStart):
- movaps 7(%rcx), %xmm2
- movaps 23(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 39(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 55(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $9, %xmm4, %xmm5
- test %rax, %rax
- palignr $9, %xmm3, %xmm4
- jnz L(Shl9Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave9)
-# endif
- palignr $9, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $9, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl9LoopStart)
-
-L(Shl9LoopExit):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl10):
- movaps -10(%rcx), %xmm1
- movaps 6(%rcx), %xmm2
-L(Shl10Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit10Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl10LoopExit)
-
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 22(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -6(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -10(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl10LoopStart):
- movaps 6(%rcx), %xmm2
- movaps 22(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 38(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 54(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $10, %xmm4, %xmm5
- test %rax, %rax
- palignr $10, %xmm3, %xmm4
- jnz L(Shl10Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave10)
-# endif
- palignr $10, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $10, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl10LoopStart)
-
-L(Shl10LoopExit):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl11):
- movaps -11(%rcx), %xmm1
- movaps 5(%rcx), %xmm2
-L(Shl11Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit11Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl11LoopExit)
-
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 21(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -5(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -11(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl11LoopStart):
- movaps 5(%rcx), %xmm2
- movaps 21(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 37(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 53(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $11, %xmm4, %xmm5
- test %rax, %rax
- palignr $11, %xmm3, %xmm4
- jnz L(Shl11Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave11)
-# endif
- palignr $11, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $11, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl11LoopStart)
-
-L(Shl11LoopExit):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl12):
- movaps -12(%rcx), %xmm1
- movaps 4(%rcx), %xmm2
-L(Shl12Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit12Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl12LoopExit)
-
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 20(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -4(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -12(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl12LoopStart):
- movaps 4(%rcx), %xmm2
- movaps 20(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 36(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 52(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $12, %xmm4, %xmm5
- test %rax, %rax
- palignr $12, %xmm3, %xmm4
- jnz L(Shl12Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave12)
-# endif
- palignr $12, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $12, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl12LoopStart)
-
-L(Shl12LoopExit):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl13):
- movaps -13(%rcx), %xmm1
- movaps 3(%rcx), %xmm2
-L(Shl13Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit13Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl13LoopExit)
-
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 19(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -3(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -13(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl13LoopStart):
- movaps 3(%rcx), %xmm2
- movaps 19(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 35(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 51(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $13, %xmm4, %xmm5
- test %rax, %rax
- palignr $13, %xmm3, %xmm4
- jnz L(Shl13Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave13)
-# endif
- palignr $13, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $13, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl13LoopStart)
-
-L(Shl13LoopExit):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl14):
- movaps -14(%rcx), %xmm1
- movaps 2(%rcx), %xmm2
-L(Shl14Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit14Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl14LoopExit)
-
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 18(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -2(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -14(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl14LoopStart):
- movaps 2(%rcx), %xmm2
- movaps 18(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 34(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 50(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $14, %xmm4, %xmm5
- test %rax, %rax
- palignr $14, %xmm3, %xmm4
- jnz L(Shl14Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave14)
-# endif
- palignr $14, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $14, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl14LoopStart)
-
-L(Shl14LoopExit):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- jmp L(CopyFrom1To16Bytes)
-
- .p2align 4
-L(Shl15):
- movaps -15(%rcx), %xmm1
- movaps 1(%rcx), %xmm2
-L(Shl15Start):
- pcmpeqb %xmm2, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm1
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
- movaps %xmm2, %xmm3
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
-
- pcmpeqb %xmm2, %xmm0
- lea 16(%rdx), %rdx
- pmovmskb %xmm0, %rax
- lea 16(%rcx), %rcx
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- jbe L(StrncpyExit15Case2OrCase3)
-# endif
- test %rax, %rax
- jnz L(Shl15LoopExit)
-
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, (%rdx)
- lea 17(%rcx), %rcx
- lea 16(%rdx), %rdx
-
- mov %rcx, %rax
- and $-0x40, %rcx
- sub %rcx, %rax
- lea -1(%rcx), %rcx
- sub %rax, %rdx
-# ifdef USE_AS_STRNCPY
- add %rax, %r8
-# endif
- movaps -15(%rcx), %xmm1
-
-/* 64 bytes loop */
- .p2align 4
-L(Shl15LoopStart):
- movaps 1(%rcx), %xmm2
- movaps 17(%rcx), %xmm3
- movaps %xmm3, %xmm6
- movaps 33(%rcx), %xmm4
- movaps %xmm4, %xmm7
- movaps 49(%rcx), %xmm5
- pminub %xmm2, %xmm6
- pminub %xmm5, %xmm7
- pminub %xmm6, %xmm7
- pcmpeqb %xmm0, %xmm7
- pmovmskb %xmm7, %rax
- movaps %xmm5, %xmm7
- palignr $15, %xmm4, %xmm5
- test %rax, %rax
- palignr $15, %xmm3, %xmm4
- jnz L(Shl15Start)
-# ifdef USE_AS_STRNCPY
- sub $64, %r8
- jbe L(StrncpyLeave15)
-# endif
- palignr $15, %xmm2, %xmm3
- lea 64(%rcx), %rcx
- palignr $15, %xmm1, %xmm2
- movaps %xmm7, %xmm1
- movaps %xmm5, 48(%rdx)
- movaps %xmm4, 32(%rdx)
- movaps %xmm3, 16(%rdx)
- movaps %xmm2, (%rdx)
- lea 64(%rdx), %rdx
- jmp L(Shl15LoopStart)
-
-L(Shl15LoopExit):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
-# ifdef USE_AS_STRCAT
- jmp L(CopyFrom1To16Bytes)
-# endif
-
-# ifndef USE_AS_STRCAT
-
- .p2align 4
-L(CopyFrom1To16Bytes):
-# ifdef USE_AS_STRNCPY
- add $16, %r8
-# endif
- add %rsi, %rdx
- add %rsi, %rcx
-
- test %al, %al
- jz L(ExitHigh)
- test $0x01, %al
- jnz L(Exit1)
- test $0x02, %al
- jnz L(Exit2)
- test $0x04, %al
- jnz L(Exit3)
- test $0x08, %al
- jnz L(Exit4)
- test $0x10, %al
- jnz L(Exit5)
- test $0x20, %al
- jnz L(Exit6)
- test $0x40, %al
- jnz L(Exit7)
-
- .p2align 4
-L(Exit8):
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $8, %r8
- lea 8(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(ExitHigh):
- test $0x01, %ah
- jnz L(Exit9)
- test $0x02, %ah
- jnz L(Exit10)
- test $0x04, %ah
- jnz L(Exit11)
- test $0x08, %ah
- jnz L(Exit12)
- test $0x10, %ah
- jnz L(Exit13)
- test $0x20, %ah
- jnz L(Exit14)
- test $0x40, %ah
- jnz L(Exit15)
-
- .p2align 4
-L(Exit16):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %rax
- mov %rax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 15(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $16, %r8
- lea 16(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
-
- .p2align 4
-L(CopyFrom1To16BytesCase2):
- add $16, %r8
- add %rsi, %rcx
- lea (%rsi, %rdx), %rsi
- lea -9(%r8), %rdx
- and $1<<7, %dh
- or %al, %dh
- test %dh, %dh
- lea (%rsi), %rdx
- jz L(ExitHighCase2)
-
- cmp $1, %r8
- je L(Exit1)
- test $0x01, %al
- jnz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- test $0x02, %al
- jnz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- test $0x04, %al
- jnz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- test $0x08, %al
- jnz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- test $0x10, %al
- jnz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- test $0x20, %al
- jnz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- test $0x40, %al
- jnz L(Exit7)
- jmp L(Exit8)
-
- .p2align 4
-L(ExitHighCase2):
- cmp $9, %r8
- je L(Exit9)
- test $0x01, %ah
- jnz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- test $0x02, %ah
- jnz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- test $0x04, %ah
- jnz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- test $0x8, %ah
- jnz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- test $0x10, %ah
- jnz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- test $0x20, %ah
- jnz L(Exit14)
- cmp $15, %r8
- je L(Exit15)
- test $0x40, %ah
- jnz L(Exit15)
- jmp L(Exit16)
-
-L(CopyFrom1To16BytesCase2OrCase3):
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
-
- .p2align 4
-L(CopyFrom1To16BytesCase3):
- add $16, %r8
- add %rsi, %rdx
- add %rsi, %rcx
-
- cmp $16, %r8
- je L(Exit16)
- cmp $8, %r8
- je L(Exit8)
- jg L(More8Case3)
- cmp $4, %r8
- je L(Exit4)
- jg L(More4Case3)
- cmp $2, %r8
- jl L(Exit1)
- je L(Exit2)
- jg L(Exit3)
-L(More8Case3): /* but less than 16 */
- cmp $12, %r8
- je L(Exit12)
- jl L(Less12Case3)
- cmp $14, %r8
- jl L(Exit13)
- je L(Exit14)
- jg L(Exit15)
-L(More4Case3): /* but less than 8 */
- cmp $6, %r8
- jl L(Exit5)
- je L(Exit6)
- jg L(Exit7)
-L(Less12Case3): /* but more than 8 */
- cmp $10, %r8
- jl L(Exit9)
- je L(Exit10)
- jg L(Exit11)
-# endif
-
- .p2align 4
-L(Exit1):
- movb (%rcx), %al
- movb %al, (%rdx)
-# ifdef USE_AS_STPCPY
- lea (%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $1, %r8
- lea 1(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit2):
- movw (%rcx), %ax
- movw %ax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 1(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $2, %r8
- lea 2(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit3):
- movw (%rcx), %ax
- movw %ax, (%rdx)
- movb 2(%rcx), %al
- movb %al, 2(%rdx)
-# ifdef USE_AS_STPCPY
- lea 2(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $3, %r8
- lea 3(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit4):
- movl (%rcx), %eax
- movl %eax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 3(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $4, %r8
- lea 4(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit5):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movb 4(%rcx), %al
- movb %al, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 4(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $5, %r8
- lea 5(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit6):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movw 4(%rcx), %ax
- movw %ax, 4(%rdx)
-# ifdef USE_AS_STPCPY
- lea 5(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $6, %r8
- lea 6(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit7):
- movl (%rcx), %eax
- movl %eax, (%rdx)
- movl 3(%rcx), %eax
- movl %eax, 3(%rdx)
-# ifdef USE_AS_STPCPY
- lea 6(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $7, %r8
- lea 7(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit9):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %eax
- mov %eax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 8(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $9, %r8
- lea 9(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit10):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %eax
- mov %eax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 9(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $10, %r8
- lea 10(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit11):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %eax
- mov %eax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 10(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $11, %r8
- lea 11(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit12):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 8(%rcx), %eax
- mov %eax, 8(%rdx)
-# ifdef USE_AS_STPCPY
- lea 11(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $12, %r8
- lea 12(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit13):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 5(%rcx), %rax
- mov %rax, 5(%rdx)
-# ifdef USE_AS_STPCPY
- lea 12(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $13, %r8
- lea 13(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit14):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 6(%rcx), %rax
- mov %rax, 6(%rdx)
-# ifdef USE_AS_STPCPY
- lea 13(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $14, %r8
- lea 14(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
- .p2align 4
-L(Exit15):
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
-# else
- mov %rdi, %rax
-# endif
-# ifdef USE_AS_STRNCPY
- sub $15, %r8
- lea 15(%rdx), %rcx
- jnz L(StrncpyFillTailWithZero1)
-# ifdef USE_AS_STPCPY
- cmpb $1, (%rax)
- sbb $-1, %rax
-# endif
-# endif
- ret
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(Fill0):
- ret
-
- .p2align 4
-L(Fill1):
- movb %dl, (%rcx)
- ret
-
- .p2align 4
-L(Fill2):
- movw %dx, (%rcx)
- ret
-
- .p2align 4
-L(Fill3):
- movw %dx, (%rcx)
- movb %dl, 2(%rcx)
- ret
-
- .p2align 4
-L(Fill4):
- movl %edx, (%rcx)
- ret
-
- .p2align 4
-L(Fill5):
- movl %edx, (%rcx)
- movb %dl, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill6):
- movl %edx, (%rcx)
- movw %dx, 4(%rcx)
- ret
-
- .p2align 4
-L(Fill7):
- movl %edx, (%rcx)
- movl %edx, 3(%rcx)
- ret
-
- .p2align 4
-L(Fill8):
- mov %rdx, (%rcx)
- ret
-
- .p2align 4
-L(Fill9):
- mov %rdx, (%rcx)
- movb %dl, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill10):
- mov %rdx, (%rcx)
- movw %dx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill11):
- mov %rdx, (%rcx)
- movl %edx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill12):
- mov %rdx, (%rcx)
- movl %edx, 8(%rcx)
- ret
-
- .p2align 4
-L(Fill13):
- mov %rdx, (%rcx)
- mov %rdx, 5(%rcx)
- ret
-
- .p2align 4
-L(Fill14):
- mov %rdx, (%rcx)
- mov %rdx, 6(%rcx)
- ret
-
- .p2align 4
-L(Fill15):
- mov %rdx, (%rcx)
- mov %rdx, 7(%rcx)
- ret
-
- .p2align 4
-L(Fill16):
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
- ret
-
- .p2align 4
-L(StrncpyFillExit1):
- lea 16(%r8), %r8
-L(FillFrom1To16Bytes):
- test %r8, %r8
- jz L(Fill0)
- cmp $16, %r8
- je L(Fill16)
- cmp $8, %r8
- je L(Fill8)
- jg L(FillMore8)
- cmp $4, %r8
- je L(Fill4)
- jg L(FillMore4)
- cmp $2, %r8
- jl L(Fill1)
- je L(Fill2)
- jg L(Fill3)
-L(FillMore8): /* but less than 16 */
- cmp $12, %r8
- je L(Fill12)
- jl L(FillLess12)
- cmp $14, %r8
- jl L(Fill13)
- je L(Fill14)
- jg L(Fill15)
-L(FillMore4): /* but less than 8 */
- cmp $6, %r8
- jl L(Fill5)
- je L(Fill6)
- jg L(Fill7)
-L(FillLess12): /* but more than 8 */
- cmp $10, %r8
- jl L(Fill9)
- je L(Fill10)
- jmp L(Fill11)
-
- .p2align 4
-L(StrncpyFillTailWithZero1):
- xor %rdx, %rdx
- sub $16, %r8
- jbe L(StrncpyFillExit1)
-
- pxor %xmm0, %xmm0
- mov %rdx, (%rcx)
- mov %rdx, 8(%rcx)
-
- lea 16(%rcx), %rcx
-
- mov %rcx, %rdx
- and $0xf, %rdx
- sub %rdx, %rcx
- add %rdx, %r8
- xor %rdx, %rdx
- sub $64, %r8
- jb L(StrncpyFillLess64)
-
-L(StrncpyFillLoopMovdqa):
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- movdqa %xmm0, 32(%rcx)
- movdqa %xmm0, 48(%rcx)
- lea 64(%rcx), %rcx
- sub $64, %r8
- jae L(StrncpyFillLoopMovdqa)
-
-L(StrncpyFillLess64):
- add $32, %r8
- jl L(StrncpyFillLess32)
- movdqa %xmm0, (%rcx)
- movdqa %xmm0, 16(%rcx)
- lea 32(%rcx), %rcx
- sub $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
-L(StrncpyFillLess32):
- add $16, %r8
- jl L(StrncpyFillExit1)
- movdqa %xmm0, (%rcx)
- lea 16(%rcx), %rcx
- jmp L(FillFrom1To16Bytes)
-
- .p2align 4
-L(Exit0):
- mov %rdx, %rax
- ret
-
- .p2align 4
-L(StrncpyExit15Bytes):
- cmp $9, %r8
- je L(Exit9)
- cmpb $0, 8(%rcx)
- jz L(Exit9)
- cmp $10, %r8
- je L(Exit10)
- cmpb $0, 9(%rcx)
- jz L(Exit10)
- cmp $11, %r8
- je L(Exit11)
- cmpb $0, 10(%rcx)
- jz L(Exit11)
- cmp $12, %r8
- je L(Exit12)
- cmpb $0, 11(%rcx)
- jz L(Exit12)
- cmp $13, %r8
- je L(Exit13)
- cmpb $0, 12(%rcx)
- jz L(Exit13)
- cmp $14, %r8
- je L(Exit14)
- cmpb $0, 13(%rcx)
- jz L(Exit14)
- mov (%rcx), %rax
- mov %rax, (%rdx)
- mov 7(%rcx), %rax
- mov %rax, 7(%rdx)
-# ifdef USE_AS_STPCPY
- lea 14(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
- .p2align 4
-L(StrncpyExit8Bytes):
- cmp $1, %r8
- je L(Exit1)
- cmpb $0, (%rcx)
- jz L(Exit1)
- cmp $2, %r8
- je L(Exit2)
- cmpb $0, 1(%rcx)
- jz L(Exit2)
- cmp $3, %r8
- je L(Exit3)
- cmpb $0, 2(%rcx)
- jz L(Exit3)
- cmp $4, %r8
- je L(Exit4)
- cmpb $0, 3(%rcx)
- jz L(Exit4)
- cmp $5, %r8
- je L(Exit5)
- cmpb $0, 4(%rcx)
- jz L(Exit5)
- cmp $6, %r8
- je L(Exit6)
- cmpb $0, 5(%rcx)
- jz L(Exit6)
- cmp $7, %r8
- je L(Exit7)
- cmpb $0, 6(%rcx)
- jz L(Exit7)
- mov (%rcx), %rax
- mov %rax, (%rdx)
-# ifdef USE_AS_STPCPY
- lea 7(%rdx), %rax
- cmpb $1, (%rax)
- sbb $-1, %rax
-# else
- mov %rdi, %rax
-# endif
- ret
-
-# endif
-# endif
-
-# ifdef USE_AS_STRNCPY
- .p2align 4
-L(StrncpyLeaveCase2OrCase3):
- test %rax, %rax
- jnz L(Aligned64LeaveCase2)
-
-L(Aligned64LeaveCase3):
- lea 64(%r8), %r8
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase3)
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase3)
-
-L(Aligned64LeaveCase2):
- pcmpeqb %xmm4, %xmm0
- pmovmskb %xmm0, %rax
- add $48, %r8
- jle L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm5, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm4, -64(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm6, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm5, -48(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(CopyFrom1To16BytesCase2OrCase3)
- test %rax, %rax
- jnz L(CopyFrom1To16Bytes)
-
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %rax
- movaps %xmm6, -32(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
- jmp L(CopyFrom1To16BytesCase2)
-/*--------------------------------------------------*/
- .p2align 4
-L(StrncpyExit1Case2OrCase3):
- movdqu -1(%rcx), %xmm0
- movdqu %xmm0, -1(%rdx)
- mov $15, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit2Case2OrCase3):
- movdqu -2(%rcx), %xmm0
- movdqu %xmm0, -2(%rdx)
- mov $14, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit3Case2OrCase3):
- movdqu -3(%rcx), %xmm0
- movdqu %xmm0, -3(%rdx)
- mov $13, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit4Case2OrCase3):
- movdqu -4(%rcx), %xmm0
- movdqu %xmm0, -4(%rdx)
- mov $12, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit5Case2OrCase3):
- movdqu -5(%rcx), %xmm0
- movdqu %xmm0, -5(%rdx)
- mov $11, %rsi
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit6Case2OrCase3):
- mov (%rcx), %rsi
- mov 6(%rcx), %r9d
- mov %r9d, 6(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $10, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit7Case2OrCase3):
- mov (%rcx), %rsi
- mov 5(%rcx), %r9d
- mov %r9d, 5(%rdx)
- mov %rsi, (%rdx)
- test %rax, %rax
- mov $9, %rsi
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit8Case2OrCase3):
- mov (%rcx), %r9
- mov $8, %rsi
- mov %r9, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit9Case2OrCase3):
- mov -1(%rcx), %r9
- mov $7, %rsi
- mov %r9, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit10Case2OrCase3):
- mov -2(%rcx), %r9
- mov $6, %rsi
- mov %r9, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit11Case2OrCase3):
- mov -3(%rcx), %r9
- mov $5, %rsi
- mov %r9, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit12Case2OrCase3):
- mov (%rcx), %r9d
- mov $4, %rsi
- mov %r9d, (%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit13Case2OrCase3):
- mov -1(%rcx), %r9d
- mov $3, %rsi
- mov %r9d, -1(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit14Case2OrCase3):
- mov -2(%rcx), %r9d
- mov $2, %rsi
- mov %r9d, -2(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyExit15Case2OrCase3):
- mov -3(%rcx), %r9d
- mov $1, %rsi
- mov %r9d, -3(%rdx)
- test %rax, %rax
- jnz L(CopyFrom1To16BytesCase2)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave1):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit1)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 31(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit1)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit1):
- lea 15(%rdx, %rsi), %rdx
- lea 15(%rcx, %rsi), %rcx
- mov -15(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -15(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave2):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit2)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 30(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit2)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit2):
- lea 14(%rdx, %rsi), %rdx
- lea 14(%rcx, %rsi), %rcx
- mov -14(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -14(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave3):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit3)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 29(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit3)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit3):
- lea 13(%rdx, %rsi), %rdx
- lea 13(%rcx, %rsi), %rcx
- mov -13(%rcx), %rsi
- mov -8(%rcx), %rax
- mov %rsi, -13(%rdx)
- mov %rax, -8(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave4):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit4)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 28(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit4)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit4):
- lea 12(%rdx, %rsi), %rdx
- lea 12(%rcx, %rsi), %rcx
- mov -12(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -12(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave5):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit5)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 27(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit5)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit5):
- lea 11(%rdx, %rsi), %rdx
- lea 11(%rcx, %rsi), %rcx
- mov -11(%rcx), %rsi
- mov -4(%rcx), %eax
- mov %rsi, -11(%rdx)
- mov %eax, -4(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave6):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit6)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 26(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit6)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit6):
- lea 10(%rdx, %rsi), %rdx
- lea 10(%rcx, %rsi), %rcx
- mov -10(%rcx), %rsi
- movw -2(%rcx), %ax
- mov %rsi, -10(%rdx)
- movw %ax, -2(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave7):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit7)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 25(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit7)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit7):
- lea 9(%rdx, %rsi), %rdx
- lea 9(%rcx, %rsi), %rcx
- mov -9(%rcx), %rsi
- movb -1(%rcx), %ah
- mov %rsi, -9(%rdx)
- movb %ah, -1(%rdx)
- xor %rsi, %rsi
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave8):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit8)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 24(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit8)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit8):
- lea 8(%rdx, %rsi), %rdx
- lea 8(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave9):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit9)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 23(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit9)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit9):
- lea 7(%rdx, %rsi), %rdx
- lea 7(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave10):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit10)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 22(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit10)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit10):
- lea 6(%rdx, %rsi), %rdx
- lea 6(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave11):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit11)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 21(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit11)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit11):
- lea 5(%rdx, %rsi), %rdx
- lea 5(%rcx, %rsi), %rcx
- mov -8(%rcx), %rax
- xor %rsi, %rsi
- mov %rax, -8(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave12):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit12)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 20(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit12)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit12):
- lea 4(%rdx, %rsi), %rdx
- lea 4(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave13):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit13)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 19(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit13)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit13):
- lea 3(%rdx, %rsi), %rdx
- lea 3(%rcx, %rsi), %rcx
- mov -4(%rcx), %eax
- xor %rsi, %rsi
- mov %eax, -4(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave14):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit14)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 18(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit14)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit14):
- lea 2(%rdx, %rsi), %rdx
- lea 2(%rcx, %rsi), %rcx
- movw -2(%rcx), %ax
- xor %rsi, %rsi
- movw %ax, -2(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
- .p2align 4
-L(StrncpyLeave15):
- movaps %xmm2, %xmm3
- add $48, %r8
- jle L(StrncpyExit15)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdx)
- movaps 17(%rcx), %xmm2
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, 16(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm4, 32(%rdx)
- lea 16(%rsi), %rsi
- sub $16, %r8
- jbe L(StrncpyExit15)
- movaps %xmm5, 48(%rdx)
- lea 16(%rsi), %rsi
- lea -16(%r8), %r8
-
-L(StrncpyExit15):
- lea 1(%rdx, %rsi), %rdx
- lea 1(%rcx, %rsi), %rcx
- movb -1(%rcx), %ah
- xor %rsi, %rsi
- movb %ah, -1(%rdx)
- jmp L(CopyFrom1To16BytesCase3)
-
-# endif
-# ifndef USE_AS_STRCAT
-END (STRCPY)
-# endif
-#endif
diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
deleted file mode 100644
index bf82ee447d..0000000000
--- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
+++ /dev/null
@@ -1,3 +0,0 @@
-#define USE_AS_STRNCPY
-#define STRCPY __strncpy_ssse3
-#include "strcpy-ssse3.S"
--
2.25.1
* [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (2 preceding siblings ...)
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 18:13 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
5 siblings, 1 reply; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
With SSE2, SSE4.1, AVX2, and EVEX versions available, very few targets
prefer SSSE3. As a result, it is no longer worth keeping the SSSE3
versions given the code size cost.
---
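Net effect on the memmove IFUNC selector (a sketch only, restating the
ifunc-memmove.h hunk below; the AVX2/AVX-512/EVEX checks earlier in
IFUNC_SELECTOR are assumed unchanged and omitted here):

	/* SSSE3 is now a narrow special case: preferred only when
	   unaligned copies are slow on the target.  */
	if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
	    && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
	  return OPTIMIZE (ssse3);

	/* Otherwise fall back to the SSE2 baseline, with ERMS when
	   available.  */
	if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
	  return OPTIMIZE (sse2_unaligned_erms);

	return OPTIMIZE (sse2_unaligned);

With __memmove_ssse3_back removed there is no SSSE3 backward variant
left to select, so the Fast_Copy_Backward check can be dropped as well.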
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +-
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
5 files changed, 6 insertions(+), 3212 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 5b02ec8de5..303fb5d734 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,7 +17,6 @@ sysdep_routines += \
memcmpeq-evex \
memcmpeq-sse2 \
memcpy-ssse3 \
- memcpy-ssse3-back \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
@@ -25,7 +24,6 @@ sysdep_routines += \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
- memmove-ssse3-back \
memrchr-avx2 \
memrchr-avx2-rtm \
memrchr-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 49ce6860d0..c6008a73ed 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memmove_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3)
@@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
- __memmove_ssse3_back)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
@@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __memcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3)
@@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX512VL),
__memcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
- __memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3)
IFUNC_IMPL_ADD (array, i, memcpy,
@@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_chk_ssse3_back)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3)
@@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_evex_unaligned_erms)
- IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
- __mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3)
IFUNC_IMPL_ADD (array, i, mempcpy, 1,
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index f8f958064c..fb01fbb301 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
@@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
}
}
- if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
- || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (sse2_unaligned_erms);
-
- return OPTIMIZE (sse2_unaligned);
+ return OPTIMIZE (ssse3);
}
- if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
- return OPTIMIZE (ssse3_back);
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (sse2_unaligned_erms);
- return OPTIMIZE (ssse3);
+ return OPTIMIZE (sse2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
deleted file mode 100644
index 92cfbf7933..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ /dev/null
@@ -1,3181 +0,0 @@
-/* memcpy with SSSE3 and REP string
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3_back
-# define MEMCPY_CHK __memcpy_chk_ssse3_back
-# define MEMPCPY __mempcpy_ssse3_back
-# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register contains the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(bwd_write_0bytes)
- cmp $144, %rdx
- jae L(copy_backward)
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-L(copy_forward):
-#endif
-L(start):
- cmp $144, %rdx
- jae L(144bytesormore)
-
-L(fwd_write_less32bytes):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jbe L(bk_write)
-#endif
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-#ifndef USE_AS_MEMMOVE
-L(bk_write):
-
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-#endif
-
- .p2align 4
-L(144bytesormore):
-
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
- movdqu (%rsi), %xmm0
- mov %rdi, %r8
- and $-16, %rdi
- add $16, %rdi
- mov %rdi, %r9
- sub %r8, %r9
- sub %r9, %rdx
- add %r9, %rsi
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- jae L(gobble_mem_fwd)
- lea L(shl_table_fwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(copy_backward):
-#ifdef DATA_CACHE_SIZE
- mov $DATA_CACHE_SIZE, %RCX_LP
-#else
- mov __x86_data_cache_size(%rip), %RCX_LP
-#endif
- shl $1, %rcx
- cmp %rcx, %rdx
- ja L(gobble_mem_bwd)
-
- add %rdx, %rdi
- add %rdx, %rsi
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $0xf, %r9
- xor %r9, %rdi
- sub %r9, %rsi
- sub %r9, %rdx
- mov %rsi, %r9
- and $0xf, %r9
- jz L(shl_0_bwd)
- lea L(shl_table_bwd)(%rip), %r11
- sub $0x80, %rdx
- movslq (%r11, %r9, 4), %r9
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(shl_0):
-
- mov %rdx, %r9
- shr $8, %r9
- add %rdx, %r9
-#ifdef DATA_CACHE_SIZE
- cmp $DATA_CACHE_SIZE_HALF, %R9_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %R9_LP
-#endif
- jae L(gobble_mem_fwd)
- sub $0x80, %rdx
- .p2align 4
-L(shl_0_loop):
- movdqa (%rsi), %xmm1
- movdqa %xmm1, (%rdi)
- movaps 0x10(%rsi), %xmm2
- movaps %xmm2, 0x10(%rdi)
- movaps 0x20(%rsi), %xmm3
- movaps %xmm3, 0x20(%rdi)
- movaps 0x30(%rsi), %xmm4
- movaps %xmm4, 0x30(%rdi)
- movaps 0x40(%rsi), %xmm1
- movaps %xmm1, 0x40(%rdi)
- movaps 0x50(%rsi), %xmm2
- movaps %xmm2, 0x50(%rdi)
- movaps 0x60(%rsi), %xmm3
- movaps %xmm3, 0x60(%rdi)
- movaps 0x70(%rsi), %xmm4
- movaps %xmm4, 0x70(%rdi)
- sub $0x80, %rdx
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(shl_0_loop)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_0_bwd):
- sub $0x80, %rdx
-L(copy_backward_loop):
- movaps -0x10(%rsi), %xmm1
- movaps %xmm1, -0x10(%rdi)
- movaps -0x20(%rsi), %xmm2
- movaps %xmm2, -0x20(%rdi)
- movaps -0x30(%rsi), %xmm3
- movaps %xmm3, -0x30(%rdi)
- movaps -0x40(%rsi), %xmm4
- movaps %xmm4, -0x40(%rdi)
- movaps -0x50(%rsi), %xmm5
- movaps %xmm5, -0x50(%rdi)
- movaps -0x60(%rsi), %xmm5
- movaps %xmm5, -0x60(%rdi)
- movaps -0x70(%rsi), %xmm5
- movaps %xmm5, -0x70(%rdi)
- movaps -0x80(%rsi), %xmm5
- movaps %xmm5, -0x80(%rdi)
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(copy_backward_loop)
-
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- sub $0x80, %rdx
- movaps -0x01(%rsi), %xmm1
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movaps 0x4f(%rsi), %xmm6
- movaps 0x5f(%rsi), %xmm7
- movaps 0x6f(%rsi), %xmm8
- movaps 0x7f(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $1, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $1, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $1, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $1, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $1, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $1, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $1, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $1, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_1)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_1_bwd):
- movaps -0x01(%rsi), %xmm1
-
- movaps -0x11(%rsi), %xmm2
- palignr $1, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x21(%rsi), %xmm3
- palignr $1, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x31(%rsi), %xmm4
- palignr $1, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x41(%rsi), %xmm5
- palignr $1, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x51(%rsi), %xmm6
- palignr $1, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x61(%rsi), %xmm7
- palignr $1, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x71(%rsi), %xmm8
- palignr $1, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x81(%rsi), %xmm9
- palignr $1, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_1_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- sub $0x80, %rdx
- movaps -0x02(%rsi), %xmm1
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movaps 0x4e(%rsi), %xmm6
- movaps 0x5e(%rsi), %xmm7
- movaps 0x6e(%rsi), %xmm8
- movaps 0x7e(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $2, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $2, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $2, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $2, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $2, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $2, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $2, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $2, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_2)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- movaps -0x02(%rsi), %xmm1
-
- movaps -0x12(%rsi), %xmm2
- palignr $2, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x22(%rsi), %xmm3
- palignr $2, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x32(%rsi), %xmm4
- palignr $2, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x42(%rsi), %xmm5
- palignr $2, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x52(%rsi), %xmm6
- palignr $2, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x62(%rsi), %xmm7
- palignr $2, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x72(%rsi), %xmm8
- palignr $2, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x82(%rsi), %xmm9
- palignr $2, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_2_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- sub $0x80, %rdx
- movaps -0x03(%rsi), %xmm1
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movaps 0x4d(%rsi), %xmm6
- movaps 0x5d(%rsi), %xmm7
- movaps 0x6d(%rsi), %xmm8
- movaps 0x7d(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $3, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $3, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $3, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $3, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $3, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $3, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $3, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $3, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_3)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- movaps -0x03(%rsi), %xmm1
-
- movaps -0x13(%rsi), %xmm2
- palignr $3, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x23(%rsi), %xmm3
- palignr $3, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x33(%rsi), %xmm4
- palignr $3, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x43(%rsi), %xmm5
- palignr $3, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x53(%rsi), %xmm6
- palignr $3, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x63(%rsi), %xmm7
- palignr $3, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x73(%rsi), %xmm8
- palignr $3, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x83(%rsi), %xmm9
- palignr $3, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_3_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- sub $0x80, %rdx
- movaps -0x04(%rsi), %xmm1
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movaps 0x4c(%rsi), %xmm6
- movaps 0x5c(%rsi), %xmm7
- movaps 0x6c(%rsi), %xmm8
- movaps 0x7c(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $4, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $4, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $4, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $4, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $4, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $4, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $4, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $4, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_4)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- movaps -0x04(%rsi), %xmm1
-
- movaps -0x14(%rsi), %xmm2
- palignr $4, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x24(%rsi), %xmm3
- palignr $4, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x34(%rsi), %xmm4
- palignr $4, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x44(%rsi), %xmm5
- palignr $4, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x54(%rsi), %xmm6
- palignr $4, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x64(%rsi), %xmm7
- palignr $4, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x74(%rsi), %xmm8
- palignr $4, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x84(%rsi), %xmm9
- palignr $4, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_4_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- sub $0x80, %rdx
- movaps -0x05(%rsi), %xmm1
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movaps 0x4b(%rsi), %xmm6
- movaps 0x5b(%rsi), %xmm7
- movaps 0x6b(%rsi), %xmm8
- movaps 0x7b(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $5, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $5, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $5, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $5, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $5, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $5, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $5, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $5, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_5)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- movaps -0x05(%rsi), %xmm1
-
- movaps -0x15(%rsi), %xmm2
- palignr $5, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x25(%rsi), %xmm3
- palignr $5, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x35(%rsi), %xmm4
- palignr $5, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x45(%rsi), %xmm5
- palignr $5, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x55(%rsi), %xmm6
- palignr $5, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x65(%rsi), %xmm7
- palignr $5, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x75(%rsi), %xmm8
- palignr $5, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x85(%rsi), %xmm9
- palignr $5, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_5_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- sub $0x80, %rdx
- movaps -0x06(%rsi), %xmm1
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movaps 0x4a(%rsi), %xmm6
- movaps 0x5a(%rsi), %xmm7
- movaps 0x6a(%rsi), %xmm8
- movaps 0x7a(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $6, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $6, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $6, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $6, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $6, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $6, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $6, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $6, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_6)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- movaps -0x06(%rsi), %xmm1
-
- movaps -0x16(%rsi), %xmm2
- palignr $6, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x26(%rsi), %xmm3
- palignr $6, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x36(%rsi), %xmm4
- palignr $6, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x46(%rsi), %xmm5
- palignr $6, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x56(%rsi), %xmm6
- palignr $6, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x66(%rsi), %xmm7
- palignr $6, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x76(%rsi), %xmm8
- palignr $6, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x86(%rsi), %xmm9
- palignr $6, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_6_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- sub $0x80, %rdx
- movaps -0x07(%rsi), %xmm1
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movaps 0x49(%rsi), %xmm6
- movaps 0x59(%rsi), %xmm7
- movaps 0x69(%rsi), %xmm8
- movaps 0x79(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $7, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $7, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $7, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $7, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $7, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $7, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $7, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $7, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_7)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- movaps -0x07(%rsi), %xmm1
-
- movaps -0x17(%rsi), %xmm2
- palignr $7, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x27(%rsi), %xmm3
- palignr $7, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x37(%rsi), %xmm4
- palignr $7, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x47(%rsi), %xmm5
- palignr $7, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x57(%rsi), %xmm6
- palignr $7, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x67(%rsi), %xmm7
- palignr $7, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x77(%rsi), %xmm8
- palignr $7, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x87(%rsi), %xmm9
- palignr $7, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_7_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- sub $0x80, %rdx
- movaps -0x08(%rsi), %xmm1
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movaps 0x48(%rsi), %xmm6
- movaps 0x58(%rsi), %xmm7
- movaps 0x68(%rsi), %xmm8
- movaps 0x78(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $8, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $8, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $8, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $8, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $8, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $8, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $8, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $8, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_8)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- movaps -0x08(%rsi), %xmm1
-
- movaps -0x18(%rsi), %xmm2
- palignr $8, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x28(%rsi), %xmm3
- palignr $8, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x38(%rsi), %xmm4
- palignr $8, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x48(%rsi), %xmm5
- palignr $8, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x58(%rsi), %xmm6
- palignr $8, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x68(%rsi), %xmm7
- palignr $8, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x78(%rsi), %xmm8
- palignr $8, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x88(%rsi), %xmm9
- palignr $8, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_8_bwd)
-L(shl_8_end_bwd):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- sub $0x80, %rdx
- movaps -0x09(%rsi), %xmm1
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movaps 0x47(%rsi), %xmm6
- movaps 0x57(%rsi), %xmm7
- movaps 0x67(%rsi), %xmm8
- movaps 0x77(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $9, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $9, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $9, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $9, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $9, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $9, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $9, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $9, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_9)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- movaps -0x09(%rsi), %xmm1
-
- movaps -0x19(%rsi), %xmm2
- palignr $9, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x29(%rsi), %xmm3
- palignr $9, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x39(%rsi), %xmm4
- palignr $9, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x49(%rsi), %xmm5
- palignr $9, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x59(%rsi), %xmm6
- palignr $9, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x69(%rsi), %xmm7
- palignr $9, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x79(%rsi), %xmm8
- palignr $9, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x89(%rsi), %xmm9
- palignr $9, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_9_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- sub $0x80, %rdx
- movaps -0x0a(%rsi), %xmm1
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movaps 0x46(%rsi), %xmm6
- movaps 0x56(%rsi), %xmm7
- movaps 0x66(%rsi), %xmm8
- movaps 0x76(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $10, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $10, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $10, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $10, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $10, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $10, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $10, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $10, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_10)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- movaps -0x0a(%rsi), %xmm1
-
- movaps -0x1a(%rsi), %xmm2
- palignr $10, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2a(%rsi), %xmm3
- palignr $10, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3a(%rsi), %xmm4
- palignr $10, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4a(%rsi), %xmm5
- palignr $10, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5a(%rsi), %xmm6
- palignr $10, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6a(%rsi), %xmm7
- palignr $10, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7a(%rsi), %xmm8
- palignr $10, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8a(%rsi), %xmm9
- palignr $10, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_10_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- sub $0x80, %rdx
- movaps -0x0b(%rsi), %xmm1
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movaps 0x45(%rsi), %xmm6
- movaps 0x55(%rsi), %xmm7
- movaps 0x65(%rsi), %xmm8
- movaps 0x75(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $11, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $11, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $11, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $11, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $11, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $11, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $11, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $11, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_11)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- movaps -0x0b(%rsi), %xmm1
-
- movaps -0x1b(%rsi), %xmm2
- palignr $11, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2b(%rsi), %xmm3
- palignr $11, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3b(%rsi), %xmm4
- palignr $11, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4b(%rsi), %xmm5
- palignr $11, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5b(%rsi), %xmm6
- palignr $11, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6b(%rsi), %xmm7
- palignr $11, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7b(%rsi), %xmm8
- palignr $11, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8b(%rsi), %xmm9
- palignr $11, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_11_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- sub $0x80, %rdx
- movdqa -0x0c(%rsi), %xmm1
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movaps 0x44(%rsi), %xmm6
- movaps 0x54(%rsi), %xmm7
- movaps 0x64(%rsi), %xmm8
- movaps 0x74(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $12, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $12, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $12, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $12, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $12, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $12, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $12, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $12, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
-
- lea 0x80(%rdi), %rdi
- jae L(shl_12)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- movaps -0x0c(%rsi), %xmm1
-
- movaps -0x1c(%rsi), %xmm2
- palignr $12, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2c(%rsi), %xmm3
- palignr $12, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3c(%rsi), %xmm4
- palignr $12, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4c(%rsi), %xmm5
- palignr $12, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5c(%rsi), %xmm6
- palignr $12, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6c(%rsi), %xmm7
- palignr $12, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7c(%rsi), %xmm8
- palignr $12, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8c(%rsi), %xmm9
- palignr $12, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_12_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- sub $0x80, %rdx
- movaps -0x0d(%rsi), %xmm1
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movaps 0x43(%rsi), %xmm6
- movaps 0x53(%rsi), %xmm7
- movaps 0x63(%rsi), %xmm8
- movaps 0x73(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $13, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $13, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $13, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $13, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $13, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $13, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $13, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $13, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_13)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- movaps -0x0d(%rsi), %xmm1
-
- movaps -0x1d(%rsi), %xmm2
- palignr $13, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2d(%rsi), %xmm3
- palignr $13, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3d(%rsi), %xmm4
- palignr $13, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4d(%rsi), %xmm5
- palignr $13, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5d(%rsi), %xmm6
- palignr $13, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6d(%rsi), %xmm7
- palignr $13, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7d(%rsi), %xmm8
- palignr $13, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8d(%rsi), %xmm9
- palignr $13, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_13_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- sub $0x80, %rdx
- movaps -0x0e(%rsi), %xmm1
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movaps 0x42(%rsi), %xmm6
- movaps 0x52(%rsi), %xmm7
- movaps 0x62(%rsi), %xmm8
- movaps 0x72(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $14, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $14, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $14, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $14, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $14, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $14, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $14, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $14, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_14)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- movaps -0x0e(%rsi), %xmm1
-
- movaps -0x1e(%rsi), %xmm2
- palignr $14, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2e(%rsi), %xmm3
- palignr $14, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3e(%rsi), %xmm4
- palignr $14, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4e(%rsi), %xmm5
- palignr $14, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5e(%rsi), %xmm6
- palignr $14, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6e(%rsi), %xmm7
- palignr $14, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7e(%rsi), %xmm8
- palignr $14, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8e(%rsi), %xmm9
- palignr $14, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_14_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- sub $0x80, %rdx
- movaps -0x0f(%rsi), %xmm1
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movaps 0x41(%rsi), %xmm6
- movaps 0x51(%rsi), %xmm7
- movaps 0x61(%rsi), %xmm8
- movaps 0x71(%rsi), %xmm9
- lea 0x80(%rsi), %rsi
- palignr $15, %xmm8, %xmm9
- movaps %xmm9, 0x70(%rdi)
- palignr $15, %xmm7, %xmm8
- movaps %xmm8, 0x60(%rdi)
- palignr $15, %xmm6, %xmm7
- movaps %xmm7, 0x50(%rdi)
- palignr $15, %xmm5, %xmm6
- movaps %xmm6, 0x40(%rdi)
- palignr $15, %xmm4, %xmm5
- movaps %xmm5, 0x30(%rdi)
- palignr $15, %xmm3, %xmm4
- movaps %xmm4, 0x20(%rdi)
- palignr $15, %xmm2, %xmm3
- movaps %xmm3, 0x10(%rdi)
- palignr $15, %xmm1, %xmm2
- movaps %xmm2, (%rdi)
- lea 0x80(%rdi), %rdi
- jae L(shl_15)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- movaps -0x0f(%rsi), %xmm1
-
- movaps -0x1f(%rsi), %xmm2
- palignr $15, %xmm2, %xmm1
- movaps %xmm1, -0x10(%rdi)
-
- movaps -0x2f(%rsi), %xmm3
- palignr $15, %xmm3, %xmm2
- movaps %xmm2, -0x20(%rdi)
-
- movaps -0x3f(%rsi), %xmm4
- palignr $15, %xmm4, %xmm3
- movaps %xmm3, -0x30(%rdi)
-
- movaps -0x4f(%rsi), %xmm5
- palignr $15, %xmm5, %xmm4
- movaps %xmm4, -0x40(%rdi)
-
- movaps -0x5f(%rsi), %xmm6
- palignr $15, %xmm6, %xmm5
- movaps %xmm5, -0x50(%rdi)
-
- movaps -0x6f(%rsi), %xmm7
- palignr $15, %xmm7, %xmm6
- movaps %xmm6, -0x60(%rdi)
-
- movaps -0x7f(%rsi), %xmm8
- palignr $15, %xmm8, %xmm7
- movaps %xmm7, -0x70(%rdi)
-
- movaps -0x8f(%rsi), %xmm9
- palignr $15, %xmm9, %xmm8
- movaps %xmm8, -0x80(%rdi)
-
- sub $0x80, %rdx
- lea -0x80(%rdi), %rdi
- lea -0x80(%rsi), %rsi
- jae L(shl_15_bwd)
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rdi
- sub %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(gobble_mem_fwd):
- movdqu (%rsi), %xmm1
- movdqu %xmm0, (%r8)
- movdqa %xmm1, (%rdi)
- sub $16, %rdx
- add $16, %rsi
- add $16, %rdi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger_in_fwd)
- mov %rdx, %rcx
-L(bigger_in_fwd):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy_fwd)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy_fwd)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy_fwd):
- sub $0x80, %rdx
-L(gobble_mem_fwd_loop):
- sub $0x80, %rdx
- prefetcht0 0x200(%rsi)
- prefetcht0 0x300(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lfence
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_mem_fwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_fwd_end)
- add $0x80, %rdx
-L(ll_cache_copy_fwd):
- add %rcx, %rdx
-L(ll_cache_copy_fwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop_fwd):
- prefetchnta 0x1c0(%rsi)
- prefetchnta 0x280(%rsi)
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x280(%rdi)
- sub $0x80, %rdx
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rsi), %rsi
- lea 0x80(%rdi), %rdi
- jae L(gobble_ll_loop_fwd)
-L(gobble_mem_fwd_end):
- add $0x80, %rdx
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
-
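(L(gobble_mem_fwd) above is the huge-copy strategy.  The remaining
length is compared against half the shared cache size -- the value
loaded into %rcx from __x86_shared_cache_size_half, or the
SHARED_CACHE_SIZE_HALF constant.  Beyond that threshold the loop uses
non-temporal movntdq stores plus prefetcht0 and finishes with sfence;
copies that plausibly fit in cache take the movdqa loop at
L(ll_cache_copy_fwd) instead, so the destination stays cache-hot.
Under USE_AS_MEMMOVE the source/destination distance is checked first,
since closely overlapping buffers must stay on the cacheable path.  A
simplified C model of that store-strategy choice -- an assumption for
illustration, not glibc's exact control flow; copy_large and cache_half
are stand-ins:

#include <stddef.h>
#include <string.h>
#include <emmintrin.h>  /* SSE2: _mm_stream_si128, _mm_sfence */

/* DST must be 16-byte aligned, as the loops above guarantee.  */
static void
copy_large (char *dst, const char *src, size_t len, size_t cache_half)
{
  if (len > cache_half)
    {
      /* Non-temporal path: write-combining stores bypass the cache,
         avoiding pollution for data that could never stay resident.  */
      size_t i;
      for (i = 0; i + 16 <= len; i += 16)
        _mm_stream_si128 ((__m128i *) (dst + i),
                          _mm_loadu_si128 ((const __m128i *) (src + i)));
      _mm_sfence ();  /* order the streaming stores, as the asm does */
      memcpy (dst + i, src + i, len - i);  /* sub-16-byte tail */
    }
  else
    memcpy (dst, src, len);  /* fits in cache: keep it cacheable */
}

The L(gobble_mem_bwd) path below mirrors all of this with descending
addresses.)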
- .p2align 4
-L(gobble_mem_bwd):
- add %rdx, %rsi
- add %rdx, %rdi
-
- movdqu -16(%rsi), %xmm0
- lea -16(%rdi), %r8
- mov %rdi, %r9
- and $-16, %rdi
- sub %rdi, %r9
- sub %r9, %rsi
- sub %r9, %rdx
-
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jbe L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
- cmp %rcx, %rdx
- ja L(bigger)
- mov %rdx, %rcx
-L(bigger):
- sub %rcx, %rdx
- cmp $0x1000, %rdx
- jbe L(ll_cache_copy)
-
- mov %rcx, %r9
- shl $3, %r9
- cmp %r9, %rdx
- jbe L(2steps_copy)
- add %rcx, %rdx
- xor %rcx, %rcx
-L(2steps_copy):
- sub $0x80, %rdx
-L(gobble_mem_bwd_loop):
- sub $0x80, %rdx
- prefetcht0 -0x200(%rsi)
- prefetcht0 -0x300(%rsi)
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- lfence
- movntdq %xmm1, -0x10(%rdi)
- movntdq %xmm2, -0x20(%rdi)
- movntdq %xmm3, -0x30(%rdi)
- movntdq %xmm4, -0x40(%rdi)
- movntdq %xmm5, -0x50(%rdi)
- movntdq %xmm6, -0x60(%rdi)
- movntdq %xmm7, -0x70(%rdi)
- movntdq %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_mem_bwd_loop)
- sfence
- cmp $0x80, %rcx
- jb L(gobble_mem_bwd_end)
- add $0x80, %rdx
-L(ll_cache_copy):
- add %rcx, %rdx
-L(ll_cache_copy_bwd_start):
- sub $0x80, %rdx
-L(gobble_ll_loop):
- prefetchnta -0x1c0(%rsi)
- prefetchnta -0x280(%rsi)
- prefetchnta -0x1c0(%rdi)
- prefetchnta -0x280(%rdi)
- sub $0x80, %rdx
- movdqu -0x10(%rsi), %xmm1
- movdqu -0x20(%rsi), %xmm2
- movdqu -0x30(%rsi), %xmm3
- movdqu -0x40(%rsi), %xmm4
- movdqu -0x50(%rsi), %xmm5
- movdqu -0x60(%rsi), %xmm6
- movdqu -0x70(%rsi), %xmm7
- movdqu -0x80(%rsi), %xmm8
- movdqa %xmm1, -0x10(%rdi)
- movdqa %xmm2, -0x20(%rdi)
- movdqa %xmm3, -0x30(%rdi)
- movdqa %xmm4, -0x40(%rdi)
- movdqa %xmm5, -0x50(%rdi)
- movdqa %xmm6, -0x60(%rdi)
- movdqa %xmm7, -0x70(%rdi)
- movdqa %xmm8, -0x80(%rdi)
- lea -0x80(%rsi), %rsi
- lea -0x80(%rdi), %rdi
- jae L(gobble_ll_loop)
-L(gobble_mem_bwd_end):
- movdqu %xmm0, (%r8)
- add $0x80, %rdx
- sub %rdx, %rsi
- sub %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
-
- .p2align 4
-L(fwd_write_128bytes):
- lddqu -128(%rsi), %xmm0
- movdqu %xmm0, -128(%rdi)
-L(fwd_write_112bytes):
- lddqu -112(%rsi), %xmm0
- movdqu %xmm0, -112(%rdi)
-L(fwd_write_96bytes):
- lddqu -96(%rsi), %xmm0
- movdqu %xmm0, -96(%rdi)
-L(fwd_write_80bytes):
- lddqu -80(%rsi), %xmm0
- movdqu %xmm0, -80(%rdi)
-L(fwd_write_64bytes):
- lddqu -64(%rsi), %xmm0
- movdqu %xmm0, -64(%rdi)
-L(fwd_write_48bytes):
- lddqu -48(%rsi), %xmm0
- movdqu %xmm0, -48(%rdi)
-L(fwd_write_32bytes):
- lddqu -32(%rsi), %xmm0
- movdqu %xmm0, -32(%rdi)
-L(fwd_write_16bytes):
- lddqu -16(%rsi), %xmm0
- movdqu %xmm0, -16(%rdi)
-L(fwd_write_0bytes):
- ret
-
-
- .p2align 4
-L(fwd_write_143bytes):
- lddqu -143(%rsi), %xmm0
- movdqu %xmm0, -143(%rdi)
-L(fwd_write_127bytes):
- lddqu -127(%rsi), %xmm0
- movdqu %xmm0, -127(%rdi)
-L(fwd_write_111bytes):
- lddqu -111(%rsi), %xmm0
- movdqu %xmm0, -111(%rdi)
-L(fwd_write_95bytes):
- lddqu -95(%rsi), %xmm0
- movdqu %xmm0, -95(%rdi)
-L(fwd_write_79bytes):
- lddqu -79(%rsi), %xmm0
- movdqu %xmm0, -79(%rdi)
-L(fwd_write_63bytes):
- lddqu -63(%rsi), %xmm0
- movdqu %xmm0, -63(%rdi)
-L(fwd_write_47bytes):
- lddqu -47(%rsi), %xmm0
- movdqu %xmm0, -47(%rdi)
-L(fwd_write_31bytes):
- lddqu -31(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -31(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
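(The L(fwd_write_*bytes) ladders above are fall-through tails: by the
time they run, %rsi and %rdi have been advanced to the end of the copy,
each label moves one 16-byte chunk at a fixed negative offset and falls
into the next-smaller label, and lengths that are not a multiple of 16
finish with two deliberately overlapping vectors instead of a byte
loop.  A sketch of the 31-byte case in C intrinsics -- copy_tail_31 is
a made-up name, and _mm_loadu_si128 stands in for lddqu:

#include <emmintrin.h>  /* SSE2 */

/* Copy the last 31 bytes given pointers one past the end, the way
   L(fwd_write_31bytes) does: two 16-byte moves overlapping by one
   byte in the middle.  */
static void
copy_tail_31 (char *dst_end, const char *src_end)
{
  __m128i hi = _mm_loadu_si128 ((const __m128i *) (src_end - 31));
  __m128i lo = _mm_loadu_si128 ((const __m128i *) (src_end - 16));
  _mm_storeu_si128 ((__m128i *) (dst_end - 31), hi);  /* bytes -31..-16 */
  _mm_storeu_si128 ((__m128i *) (dst_end - 16), lo);  /* bytes -16..-1 */
}

The same two-overlapping-vector pattern covers every length from 17 to
31; below 16 bytes the tails switch to overlapping integer moves, as in
L(fwd_write_15bytes) above.)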
- .p2align 4
-L(fwd_write_15bytes):
- mov -15(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -15(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_142bytes):
- lddqu -142(%rsi), %xmm0
- movdqu %xmm0, -142(%rdi)
-L(fwd_write_126bytes):
- lddqu -126(%rsi), %xmm0
- movdqu %xmm0, -126(%rdi)
-L(fwd_write_110bytes):
- lddqu -110(%rsi), %xmm0
- movdqu %xmm0, -110(%rdi)
-L(fwd_write_94bytes):
- lddqu -94(%rsi), %xmm0
- movdqu %xmm0, -94(%rdi)
-L(fwd_write_78bytes):
- lddqu -78(%rsi), %xmm0
- movdqu %xmm0, -78(%rdi)
-L(fwd_write_62bytes):
- lddqu -62(%rsi), %xmm0
- movdqu %xmm0, -62(%rdi)
-L(fwd_write_46bytes):
- lddqu -46(%rsi), %xmm0
- movdqu %xmm0, -46(%rdi)
-L(fwd_write_30bytes):
- lddqu -30(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -30(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_14bytes):
- mov -14(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -14(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_141bytes):
- lddqu -141(%rsi), %xmm0
- movdqu %xmm0, -141(%rdi)
-L(fwd_write_125bytes):
- lddqu -125(%rsi), %xmm0
- movdqu %xmm0, -125(%rdi)
-L(fwd_write_109bytes):
- lddqu -109(%rsi), %xmm0
- movdqu %xmm0, -109(%rdi)
-L(fwd_write_93bytes):
- lddqu -93(%rsi), %xmm0
- movdqu %xmm0, -93(%rdi)
-L(fwd_write_77bytes):
- lddqu -77(%rsi), %xmm0
- movdqu %xmm0, -77(%rdi)
-L(fwd_write_61bytes):
- lddqu -61(%rsi), %xmm0
- movdqu %xmm0, -61(%rdi)
-L(fwd_write_45bytes):
- lddqu -45(%rsi), %xmm0
- movdqu %xmm0, -45(%rdi)
-L(fwd_write_29bytes):
- lddqu -29(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -29(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_13bytes):
- mov -13(%rsi), %rdx
- mov -8(%rsi), %rcx
- mov %rdx, -13(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_140bytes):
- lddqu -140(%rsi), %xmm0
- movdqu %xmm0, -140(%rdi)
-L(fwd_write_124bytes):
- lddqu -124(%rsi), %xmm0
- movdqu %xmm0, -124(%rdi)
-L(fwd_write_108bytes):
- lddqu -108(%rsi), %xmm0
- movdqu %xmm0, -108(%rdi)
-L(fwd_write_92bytes):
- lddqu -92(%rsi), %xmm0
- movdqu %xmm0, -92(%rdi)
-L(fwd_write_76bytes):
- lddqu -76(%rsi), %xmm0
- movdqu %xmm0, -76(%rdi)
-L(fwd_write_60bytes):
- lddqu -60(%rsi), %xmm0
- movdqu %xmm0, -60(%rdi)
-L(fwd_write_44bytes):
- lddqu -44(%rsi), %xmm0
- movdqu %xmm0, -44(%rdi)
-L(fwd_write_28bytes):
- lddqu -28(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -28(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_12bytes):
- mov -12(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -12(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_139bytes):
- lddqu -139(%rsi), %xmm0
- movdqu %xmm0, -139(%rdi)
-L(fwd_write_123bytes):
- lddqu -123(%rsi), %xmm0
- movdqu %xmm0, -123(%rdi)
-L(fwd_write_107bytes):
- lddqu -107(%rsi), %xmm0
- movdqu %xmm0, -107(%rdi)
-L(fwd_write_91bytes):
- lddqu -91(%rsi), %xmm0
- movdqu %xmm0, -91(%rdi)
-L(fwd_write_75bytes):
- lddqu -75(%rsi), %xmm0
- movdqu %xmm0, -75(%rdi)
-L(fwd_write_59bytes):
- lddqu -59(%rsi), %xmm0
- movdqu %xmm0, -59(%rdi)
-L(fwd_write_43bytes):
- lddqu -43(%rsi), %xmm0
- movdqu %xmm0, -43(%rdi)
-L(fwd_write_27bytes):
- lddqu -27(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -27(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_11bytes):
- mov -11(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -11(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_138bytes):
- lddqu -138(%rsi), %xmm0
- movdqu %xmm0, -138(%rdi)
-L(fwd_write_122bytes):
- lddqu -122(%rsi), %xmm0
- movdqu %xmm0, -122(%rdi)
-L(fwd_write_106bytes):
- lddqu -106(%rsi), %xmm0
- movdqu %xmm0, -106(%rdi)
-L(fwd_write_90bytes):
- lddqu -90(%rsi), %xmm0
- movdqu %xmm0, -90(%rdi)
-L(fwd_write_74bytes):
- lddqu -74(%rsi), %xmm0
- movdqu %xmm0, -74(%rdi)
-L(fwd_write_58bytes):
- lddqu -58(%rsi), %xmm0
- movdqu %xmm0, -58(%rdi)
-L(fwd_write_42bytes):
- lddqu -42(%rsi), %xmm0
- movdqu %xmm0, -42(%rdi)
-L(fwd_write_26bytes):
- lddqu -26(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -26(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_10bytes):
- mov -10(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -10(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_137bytes):
- lddqu -137(%rsi), %xmm0
- movdqu %xmm0, -137(%rdi)
-L(fwd_write_121bytes):
- lddqu -121(%rsi), %xmm0
- movdqu %xmm0, -121(%rdi)
-L(fwd_write_105bytes):
- lddqu -105(%rsi), %xmm0
- movdqu %xmm0, -105(%rdi)
-L(fwd_write_89bytes):
- lddqu -89(%rsi), %xmm0
- movdqu %xmm0, -89(%rdi)
-L(fwd_write_73bytes):
- lddqu -73(%rsi), %xmm0
- movdqu %xmm0, -73(%rdi)
-L(fwd_write_57bytes):
- lddqu -57(%rsi), %xmm0
- movdqu %xmm0, -57(%rdi)
-L(fwd_write_41bytes):
- lddqu -41(%rsi), %xmm0
- movdqu %xmm0, -41(%rdi)
-L(fwd_write_25bytes):
- lddqu -25(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -25(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_9bytes):
- mov -9(%rsi), %rdx
- mov -4(%rsi), %ecx
- mov %rdx, -9(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_136bytes):
- lddqu -136(%rsi), %xmm0
- movdqu %xmm0, -136(%rdi)
-L(fwd_write_120bytes):
- lddqu -120(%rsi), %xmm0
- movdqu %xmm0, -120(%rdi)
-L(fwd_write_104bytes):
- lddqu -104(%rsi), %xmm0
- movdqu %xmm0, -104(%rdi)
-L(fwd_write_88bytes):
- lddqu -88(%rsi), %xmm0
- movdqu %xmm0, -88(%rdi)
-L(fwd_write_72bytes):
- lddqu -72(%rsi), %xmm0
- movdqu %xmm0, -72(%rdi)
-L(fwd_write_56bytes):
- lddqu -56(%rsi), %xmm0
- movdqu %xmm0, -56(%rdi)
-L(fwd_write_40bytes):
- lddqu -40(%rsi), %xmm0
- movdqu %xmm0, -40(%rdi)
-L(fwd_write_24bytes):
- lddqu -24(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -24(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_135bytes):
- lddqu -135(%rsi), %xmm0
- movdqu %xmm0, -135(%rdi)
-L(fwd_write_119bytes):
- lddqu -119(%rsi), %xmm0
- movdqu %xmm0, -119(%rdi)
-L(fwd_write_103bytes):
- lddqu -103(%rsi), %xmm0
- movdqu %xmm0, -103(%rdi)
-L(fwd_write_87bytes):
- lddqu -87(%rsi), %xmm0
- movdqu %xmm0, -87(%rdi)
-L(fwd_write_71bytes):
- lddqu -71(%rsi), %xmm0
- movdqu %xmm0, -71(%rdi)
-L(fwd_write_55bytes):
- lddqu -55(%rsi), %xmm0
- movdqu %xmm0, -55(%rdi)
-L(fwd_write_39bytes):
- lddqu -39(%rsi), %xmm0
- movdqu %xmm0, -39(%rdi)
-L(fwd_write_23bytes):
- lddqu -23(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -23(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_134bytes):
- lddqu -134(%rsi), %xmm0
- movdqu %xmm0, -134(%rdi)
-L(fwd_write_118bytes):
- lddqu -118(%rsi), %xmm0
- movdqu %xmm0, -118(%rdi)
-L(fwd_write_102bytes):
- lddqu -102(%rsi), %xmm0
- movdqu %xmm0, -102(%rdi)
-L(fwd_write_86bytes):
- lddqu -86(%rsi), %xmm0
- movdqu %xmm0, -86(%rdi)
-L(fwd_write_70bytes):
- lddqu -70(%rsi), %xmm0
- movdqu %xmm0, -70(%rdi)
-L(fwd_write_54bytes):
- lddqu -54(%rsi), %xmm0
- movdqu %xmm0, -54(%rdi)
-L(fwd_write_38bytes):
- lddqu -38(%rsi), %xmm0
- movdqu %xmm0, -38(%rdi)
-L(fwd_write_22bytes):
- lddqu -22(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -22(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_133bytes):
- lddqu -133(%rsi), %xmm0
- movdqu %xmm0, -133(%rdi)
-L(fwd_write_117bytes):
- lddqu -117(%rsi), %xmm0
- movdqu %xmm0, -117(%rdi)
-L(fwd_write_101bytes):
- lddqu -101(%rsi), %xmm0
- movdqu %xmm0, -101(%rdi)
-L(fwd_write_85bytes):
- lddqu -85(%rsi), %xmm0
- movdqu %xmm0, -85(%rdi)
-L(fwd_write_69bytes):
- lddqu -69(%rsi), %xmm0
- movdqu %xmm0, -69(%rdi)
-L(fwd_write_53bytes):
- lddqu -53(%rsi), %xmm0
- movdqu %xmm0, -53(%rdi)
-L(fwd_write_37bytes):
- lddqu -37(%rsi), %xmm0
- movdqu %xmm0, -37(%rdi)
-L(fwd_write_21bytes):
- lddqu -21(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -21(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_132bytes):
- lddqu -132(%rsi), %xmm0
- movdqu %xmm0, -132(%rdi)
-L(fwd_write_116bytes):
- lddqu -116(%rsi), %xmm0
- movdqu %xmm0, -116(%rdi)
-L(fwd_write_100bytes):
- lddqu -100(%rsi), %xmm0
- movdqu %xmm0, -100(%rdi)
-L(fwd_write_84bytes):
- lddqu -84(%rsi), %xmm0
- movdqu %xmm0, -84(%rdi)
-L(fwd_write_68bytes):
- lddqu -68(%rsi), %xmm0
- movdqu %xmm0, -68(%rdi)
-L(fwd_write_52bytes):
- lddqu -52(%rsi), %xmm0
- movdqu %xmm0, -52(%rdi)
-L(fwd_write_36bytes):
- lddqu -36(%rsi), %xmm0
- movdqu %xmm0, -36(%rdi)
-L(fwd_write_20bytes):
- lddqu -20(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -20(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_131bytes):
- lddqu -131(%rsi), %xmm0
- movdqu %xmm0, -131(%rdi)
-L(fwd_write_115bytes):
- lddqu -115(%rsi), %xmm0
- movdqu %xmm0, -115(%rdi)
-L(fwd_write_99bytes):
- lddqu -99(%rsi), %xmm0
- movdqu %xmm0, -99(%rdi)
-L(fwd_write_83bytes):
- lddqu -83(%rsi), %xmm0
- movdqu %xmm0, -83(%rdi)
-L(fwd_write_67bytes):
- lddqu -67(%rsi), %xmm0
- movdqu %xmm0, -67(%rdi)
-L(fwd_write_51bytes):
- lddqu -51(%rsi), %xmm0
- movdqu %xmm0, -51(%rdi)
-L(fwd_write_35bytes):
- lddqu -35(%rsi), %xmm0
- movdqu %xmm0, -35(%rdi)
-L(fwd_write_19bytes):
- lddqu -19(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -19(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_130bytes):
- lddqu -130(%rsi), %xmm0
- movdqu %xmm0, -130(%rdi)
-L(fwd_write_114bytes):
- lddqu -114(%rsi), %xmm0
- movdqu %xmm0, -114(%rdi)
-L(fwd_write_98bytes):
- lddqu -98(%rsi), %xmm0
- movdqu %xmm0, -98(%rdi)
-L(fwd_write_82bytes):
- lddqu -82(%rsi), %xmm0
- movdqu %xmm0, -82(%rdi)
-L(fwd_write_66bytes):
- lddqu -66(%rsi), %xmm0
- movdqu %xmm0, -66(%rdi)
-L(fwd_write_50bytes):
- lddqu -50(%rsi), %xmm0
- movdqu %xmm0, -50(%rdi)
-L(fwd_write_34bytes):
- lddqu -34(%rsi), %xmm0
- movdqu %xmm0, -34(%rdi)
-L(fwd_write_18bytes):
- lddqu -18(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -18(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_2bytes):
- movzwl -2(%rsi), %edx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_129bytes):
- lddqu -129(%rsi), %xmm0
- movdqu %xmm0, -129(%rdi)
-L(fwd_write_113bytes):
- lddqu -113(%rsi), %xmm0
- movdqu %xmm0, -113(%rdi)
-L(fwd_write_97bytes):
- lddqu -97(%rsi), %xmm0
- movdqu %xmm0, -97(%rdi)
-L(fwd_write_81bytes):
- lddqu -81(%rsi), %xmm0
- movdqu %xmm0, -81(%rdi)
-L(fwd_write_65bytes):
- lddqu -65(%rsi), %xmm0
- movdqu %xmm0, -65(%rdi)
-L(fwd_write_49bytes):
- lddqu -49(%rsi), %xmm0
- movdqu %xmm0, -49(%rdi)
-L(fwd_write_33bytes):
- lddqu -33(%rsi), %xmm0
- movdqu %xmm0, -33(%rdi)
-L(fwd_write_17bytes):
- lddqu -17(%rsi), %xmm0
- lddqu -16(%rsi), %xmm1
- movdqu %xmm0, -17(%rdi)
- movdqu %xmm1, -16(%rdi)
- ret
-
- .p2align 4
-L(fwd_write_1bytes):
- movzbl -1(%rsi), %edx
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(bwd_write_128bytes):
- lddqu 112(%rsi), %xmm0
- movdqu %xmm0, 112(%rdi)
-L(bwd_write_112bytes):
- lddqu 96(%rsi), %xmm0
- movdqu %xmm0, 96(%rdi)
-L(bwd_write_96bytes):
- lddqu 80(%rsi), %xmm0
- movdqu %xmm0, 80(%rdi)
-L(bwd_write_80bytes):
- lddqu 64(%rsi), %xmm0
- movdqu %xmm0, 64(%rdi)
-L(bwd_write_64bytes):
- lddqu 48(%rsi), %xmm0
- movdqu %xmm0, 48(%rdi)
-L(bwd_write_48bytes):
- lddqu 32(%rsi), %xmm0
- movdqu %xmm0, 32(%rdi)
-L(bwd_write_32bytes):
- lddqu 16(%rsi), %xmm0
- movdqu %xmm0, 16(%rdi)
-L(bwd_write_16bytes):
- lddqu (%rsi), %xmm0
- movdqu %xmm0, (%rdi)
-L(bwd_write_0bytes):
- ret
-
- .p2align 4
-L(bwd_write_143bytes):
- lddqu 127(%rsi), %xmm0
- movdqu %xmm0, 127(%rdi)
-L(bwd_write_127bytes):
- lddqu 111(%rsi), %xmm0
- movdqu %xmm0, 111(%rdi)
-L(bwd_write_111bytes):
- lddqu 95(%rsi), %xmm0
- movdqu %xmm0, 95(%rdi)
-L(bwd_write_95bytes):
- lddqu 79(%rsi), %xmm0
- movdqu %xmm0, 79(%rdi)
-L(bwd_write_79bytes):
- lddqu 63(%rsi), %xmm0
- movdqu %xmm0, 63(%rdi)
-L(bwd_write_63bytes):
- lddqu 47(%rsi), %xmm0
- movdqu %xmm0, 47(%rdi)
-L(bwd_write_47bytes):
- lddqu 31(%rsi), %xmm0
- movdqu %xmm0, 31(%rdi)
-L(bwd_write_31bytes):
- lddqu 15(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 15(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
-
- .p2align 4
-L(bwd_write_15bytes):
- mov 7(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 7(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_142bytes):
- lddqu 126(%rsi), %xmm0
- movdqu %xmm0, 126(%rdi)
-L(bwd_write_126bytes):
- lddqu 110(%rsi), %xmm0
- movdqu %xmm0, 110(%rdi)
-L(bwd_write_110bytes):
- lddqu 94(%rsi), %xmm0
- movdqu %xmm0, 94(%rdi)
-L(bwd_write_94bytes):
- lddqu 78(%rsi), %xmm0
- movdqu %xmm0, 78(%rdi)
-L(bwd_write_78bytes):
- lddqu 62(%rsi), %xmm0
- movdqu %xmm0, 62(%rdi)
-L(bwd_write_62bytes):
- lddqu 46(%rsi), %xmm0
- movdqu %xmm0, 46(%rdi)
-L(bwd_write_46bytes):
- lddqu 30(%rsi), %xmm0
- movdqu %xmm0, 30(%rdi)
-L(bwd_write_30bytes):
- lddqu 14(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 14(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_14bytes):
- mov 6(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 6(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_141bytes):
- lddqu 125(%rsi), %xmm0
- movdqu %xmm0, 125(%rdi)
-L(bwd_write_125bytes):
- lddqu 109(%rsi), %xmm0
- movdqu %xmm0, 109(%rdi)
-L(bwd_write_109bytes):
- lddqu 93(%rsi), %xmm0
- movdqu %xmm0, 93(%rdi)
-L(bwd_write_93bytes):
- lddqu 77(%rsi), %xmm0
- movdqu %xmm0, 77(%rdi)
-L(bwd_write_77bytes):
- lddqu 61(%rsi), %xmm0
- movdqu %xmm0, 61(%rdi)
-L(bwd_write_61bytes):
- lddqu 45(%rsi), %xmm0
- movdqu %xmm0, 45(%rdi)
-L(bwd_write_45bytes):
- lddqu 29(%rsi), %xmm0
- movdqu %xmm0, 29(%rdi)
-L(bwd_write_29bytes):
- lddqu 13(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 13(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_13bytes):
- mov 5(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 5(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_140bytes):
- lddqu 124(%rsi), %xmm0
- movdqu %xmm0, 124(%rdi)
-L(bwd_write_124bytes):
- lddqu 108(%rsi), %xmm0
- movdqu %xmm0, 108(%rdi)
-L(bwd_write_108bytes):
- lddqu 92(%rsi), %xmm0
- movdqu %xmm0, 92(%rdi)
-L(bwd_write_92bytes):
- lddqu 76(%rsi), %xmm0
- movdqu %xmm0, 76(%rdi)
-L(bwd_write_76bytes):
- lddqu 60(%rsi), %xmm0
- movdqu %xmm0, 60(%rdi)
-L(bwd_write_60bytes):
- lddqu 44(%rsi), %xmm0
- movdqu %xmm0, 44(%rdi)
-L(bwd_write_44bytes):
- lddqu 28(%rsi), %xmm0
- movdqu %xmm0, 28(%rdi)
-L(bwd_write_28bytes):
- lddqu 12(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 12(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_12bytes):
- mov 4(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 4(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_139bytes):
- lddqu 123(%rsi), %xmm0
- movdqu %xmm0, 123(%rdi)
-L(bwd_write_123bytes):
- lddqu 107(%rsi), %xmm0
- movdqu %xmm0, 107(%rdi)
-L(bwd_write_107bytes):
- lddqu 91(%rsi), %xmm0
- movdqu %xmm0, 91(%rdi)
-L(bwd_write_91bytes):
- lddqu 75(%rsi), %xmm0
- movdqu %xmm0, 75(%rdi)
-L(bwd_write_75bytes):
- lddqu 59(%rsi), %xmm0
- movdqu %xmm0, 59(%rdi)
-L(bwd_write_59bytes):
- lddqu 43(%rsi), %xmm0
- movdqu %xmm0, 43(%rdi)
-L(bwd_write_43bytes):
- lddqu 27(%rsi), %xmm0
- movdqu %xmm0, 27(%rdi)
-L(bwd_write_27bytes):
- lddqu 11(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 11(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_11bytes):
- mov 3(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 3(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_138bytes):
- lddqu 122(%rsi), %xmm0
- movdqu %xmm0, 122(%rdi)
-L(bwd_write_122bytes):
- lddqu 106(%rsi), %xmm0
- movdqu %xmm0, 106(%rdi)
-L(bwd_write_106bytes):
- lddqu 90(%rsi), %xmm0
- movdqu %xmm0, 90(%rdi)
-L(bwd_write_90bytes):
- lddqu 74(%rsi), %xmm0
- movdqu %xmm0, 74(%rdi)
-L(bwd_write_74bytes):
- lddqu 58(%rsi), %xmm0
- movdqu %xmm0, 58(%rdi)
-L(bwd_write_58bytes):
- lddqu 42(%rsi), %xmm0
- movdqu %xmm0, 42(%rdi)
-L(bwd_write_42bytes):
- lddqu 26(%rsi), %xmm0
- movdqu %xmm0, 26(%rdi)
-L(bwd_write_26bytes):
- lddqu 10(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 10(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_10bytes):
- mov 2(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 2(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_137bytes):
- lddqu 121(%rsi), %xmm0
- movdqu %xmm0, 121(%rdi)
-L(bwd_write_121bytes):
- lddqu 105(%rsi), %xmm0
- movdqu %xmm0, 105(%rdi)
-L(bwd_write_105bytes):
- lddqu 89(%rsi), %xmm0
- movdqu %xmm0, 89(%rdi)
-L(bwd_write_89bytes):
- lddqu 73(%rsi), %xmm0
- movdqu %xmm0, 73(%rdi)
-L(bwd_write_73bytes):
- lddqu 57(%rsi), %xmm0
- movdqu %xmm0, 57(%rdi)
-L(bwd_write_57bytes):
- lddqu 41(%rsi), %xmm0
- movdqu %xmm0, 41(%rdi)
-L(bwd_write_41bytes):
- lddqu 25(%rsi), %xmm0
- movdqu %xmm0, 25(%rdi)
-L(bwd_write_25bytes):
- lddqu 9(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 9(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_9bytes):
- mov 1(%rsi), %rdx
- mov (%rsi), %rcx
- mov %rdx, 1(%rdi)
- mov %rcx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_136bytes):
- lddqu 120(%rsi), %xmm0
- movdqu %xmm0, 120(%rdi)
-L(bwd_write_120bytes):
- lddqu 104(%rsi), %xmm0
- movdqu %xmm0, 104(%rdi)
-L(bwd_write_104bytes):
- lddqu 88(%rsi), %xmm0
- movdqu %xmm0, 88(%rdi)
-L(bwd_write_88bytes):
- lddqu 72(%rsi), %xmm0
- movdqu %xmm0, 72(%rdi)
-L(bwd_write_72bytes):
- lddqu 56(%rsi), %xmm0
- movdqu %xmm0, 56(%rdi)
-L(bwd_write_56bytes):
- lddqu 40(%rsi), %xmm0
- movdqu %xmm0, 40(%rdi)
-L(bwd_write_40bytes):
- lddqu 24(%rsi), %xmm0
- movdqu %xmm0, 24(%rdi)
-L(bwd_write_24bytes):
- lddqu 8(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 8(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_8bytes):
- mov (%rsi), %rdx
- mov %rdx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_135bytes):
- lddqu 119(%rsi), %xmm0
- movdqu %xmm0, 119(%rdi)
-L(bwd_write_119bytes):
- lddqu 103(%rsi), %xmm0
- movdqu %xmm0, 103(%rdi)
-L(bwd_write_103bytes):
- lddqu 87(%rsi), %xmm0
- movdqu %xmm0, 87(%rdi)
-L(bwd_write_87bytes):
- lddqu 71(%rsi), %xmm0
- movdqu %xmm0, 71(%rdi)
-L(bwd_write_71bytes):
- lddqu 55(%rsi), %xmm0
- movdqu %xmm0, 55(%rdi)
-L(bwd_write_55bytes):
- lddqu 39(%rsi), %xmm0
- movdqu %xmm0, 39(%rdi)
-L(bwd_write_39bytes):
- lddqu 23(%rsi), %xmm0
- movdqu %xmm0, 23(%rdi)
-L(bwd_write_23bytes):
- lddqu 7(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 7(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_7bytes):
- mov 3(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 3(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_134bytes):
- lddqu 118(%rsi), %xmm0
- movdqu %xmm0, 118(%rdi)
-L(bwd_write_118bytes):
- lddqu 102(%rsi), %xmm0
- movdqu %xmm0, 102(%rdi)
-L(bwd_write_102bytes):
- lddqu 86(%rsi), %xmm0
- movdqu %xmm0, 86(%rdi)
-L(bwd_write_86bytes):
- lddqu 70(%rsi), %xmm0
- movdqu %xmm0, 70(%rdi)
-L(bwd_write_70bytes):
- lddqu 54(%rsi), %xmm0
- movdqu %xmm0, 54(%rdi)
-L(bwd_write_54bytes):
- lddqu 38(%rsi), %xmm0
- movdqu %xmm0, 38(%rdi)
-L(bwd_write_38bytes):
- lddqu 22(%rsi), %xmm0
- movdqu %xmm0, 22(%rdi)
-L(bwd_write_22bytes):
- lddqu 6(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 6(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_6bytes):
- mov 2(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 2(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_133bytes):
- lddqu 117(%rsi), %xmm0
- movdqu %xmm0, 117(%rdi)
-L(bwd_write_117bytes):
- lddqu 101(%rsi), %xmm0
- movdqu %xmm0, 101(%rdi)
-L(bwd_write_101bytes):
- lddqu 85(%rsi), %xmm0
- movdqu %xmm0, 85(%rdi)
-L(bwd_write_85bytes):
- lddqu 69(%rsi), %xmm0
- movdqu %xmm0, 69(%rdi)
-L(bwd_write_69bytes):
- lddqu 53(%rsi), %xmm0
- movdqu %xmm0, 53(%rdi)
-L(bwd_write_53bytes):
- lddqu 37(%rsi), %xmm0
- movdqu %xmm0, 37(%rdi)
-L(bwd_write_37bytes):
- lddqu 21(%rsi), %xmm0
- movdqu %xmm0, 21(%rdi)
-L(bwd_write_21bytes):
- lddqu 5(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 5(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_5bytes):
- mov 1(%rsi), %edx
- mov (%rsi), %ecx
- mov %edx, 1(%rdi)
- mov %ecx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_132bytes):
- lddqu 116(%rsi), %xmm0
- movdqu %xmm0, 116(%rdi)
-L(bwd_write_116bytes):
- lddqu 100(%rsi), %xmm0
- movdqu %xmm0, 100(%rdi)
-L(bwd_write_100bytes):
- lddqu 84(%rsi), %xmm0
- movdqu %xmm0, 84(%rdi)
-L(bwd_write_84bytes):
- lddqu 68(%rsi), %xmm0
- movdqu %xmm0, 68(%rdi)
-L(bwd_write_68bytes):
- lddqu 52(%rsi), %xmm0
- movdqu %xmm0, 52(%rdi)
-L(bwd_write_52bytes):
- lddqu 36(%rsi), %xmm0
- movdqu %xmm0, 36(%rdi)
-L(bwd_write_36bytes):
- lddqu 20(%rsi), %xmm0
- movdqu %xmm0, 20(%rdi)
-L(bwd_write_20bytes):
- lddqu 4(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 4(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_4bytes):
- mov (%rsi), %edx
- mov %edx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_131bytes):
- lddqu 115(%rsi), %xmm0
- movdqu %xmm0, 115(%rdi)
-L(bwd_write_115bytes):
- lddqu 99(%rsi), %xmm0
- movdqu %xmm0, 99(%rdi)
-L(bwd_write_99bytes):
- lddqu 83(%rsi), %xmm0
- movdqu %xmm0, 83(%rdi)
-L(bwd_write_83bytes):
- lddqu 67(%rsi), %xmm0
- movdqu %xmm0, 67(%rdi)
-L(bwd_write_67bytes):
- lddqu 51(%rsi), %xmm0
- movdqu %xmm0, 51(%rdi)
-L(bwd_write_51bytes):
- lddqu 35(%rsi), %xmm0
- movdqu %xmm0, 35(%rdi)
-L(bwd_write_35bytes):
- lddqu 19(%rsi), %xmm0
- movdqu %xmm0, 19(%rdi)
-L(bwd_write_19bytes):
- lddqu 3(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 3(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_3bytes):
- mov 1(%rsi), %dx
- mov (%rsi), %cx
- mov %dx, 1(%rdi)
- mov %cx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_130bytes):
- lddqu 114(%rsi), %xmm0
- movdqu %xmm0, 114(%rdi)
-L(bwd_write_114bytes):
- lddqu 98(%rsi), %xmm0
- movdqu %xmm0, 98(%rdi)
-L(bwd_write_98bytes):
- lddqu 82(%rsi), %xmm0
- movdqu %xmm0, 82(%rdi)
-L(bwd_write_82bytes):
- lddqu 66(%rsi), %xmm0
- movdqu %xmm0, 66(%rdi)
-L(bwd_write_66bytes):
- lddqu 50(%rsi), %xmm0
- movdqu %xmm0, 50(%rdi)
-L(bwd_write_50bytes):
- lddqu 34(%rsi), %xmm0
- movdqu %xmm0, 34(%rdi)
-L(bwd_write_34bytes):
- lddqu 18(%rsi), %xmm0
- movdqu %xmm0, 18(%rdi)
-L(bwd_write_18bytes):
- lddqu 2(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 2(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_2bytes):
- movzwl (%rsi), %edx
- mov %dx, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_129bytes):
- lddqu 113(%rsi), %xmm0
- movdqu %xmm0, 113(%rdi)
-L(bwd_write_113bytes):
- lddqu 97(%rsi), %xmm0
- movdqu %xmm0, 97(%rdi)
-L(bwd_write_97bytes):
- lddqu 81(%rsi), %xmm0
- movdqu %xmm0, 81(%rdi)
-L(bwd_write_81bytes):
- lddqu 65(%rsi), %xmm0
- movdqu %xmm0, 65(%rdi)
-L(bwd_write_65bytes):
- lddqu 49(%rsi), %xmm0
- movdqu %xmm0, 49(%rdi)
-L(bwd_write_49bytes):
- lddqu 33(%rsi), %xmm0
- movdqu %xmm0, 33(%rdi)
-L(bwd_write_33bytes):
- lddqu 17(%rsi), %xmm0
- movdqu %xmm0, 17(%rdi)
-L(bwd_write_17bytes):
- lddqu 1(%rsi), %xmm0
- lddqu (%rsi), %xmm1
- movdqu %xmm0, 1(%rdi)
- movdqu %xmm1, (%rdi)
- ret
-
- .p2align 4
-L(bwd_write_1bytes):
- movzbl (%rsi), %edx
- mov %dl, (%rdi)
- ret
-
-END (MEMCPY)
-
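(The tables below are what BRANCH_TO_JMPTBL_ENTRY indexes: 144 entries,
one per residual length 0..143, each a 32-bit offset of the matching
L(bwd_write_Nbytes) or L(fwd_write_Nbytes) tail relative to the table
itself -- that is what the JMPTBL macro encodes.  Table-relative
offsets keep the dispatch position-independent and the table half the
size of 64-bit absolute pointers.  A function-pointer approximation in
C, as an assumption for illustration only -- the real dispatch is a
computed jump, roughly "lea table(%rip), %r11; movslq (%r11,%rdx,4),
%rdx; lea (%r11,%rdx), %rdx; jmp *%rdx":

#include <stddef.h>

typedef void (*tail_fn) (char *dst_end, const char *src_end);

/* Hypothetical model: one handler per residual byte count, with
   separate tables for the forward and backward copiers.  */
static void
dispatch_tail (const tail_fn table[144], size_t residual,
               char *dst_end, const char *src_end)
{
  table[residual] (dst_end, src_end);
}
)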
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_144_bytes_bwd):
- .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
- .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
-
- .p2align 3
-L(table_144_bytes_fwd):
- .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
- .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
-
- .p2align 3
-L(shl_table_fwd):
- .int JMPTBL (L(shl_0), L(shl_table_fwd))
- .int JMPTBL (L(shl_1), L(shl_table_fwd))
- .int JMPTBL (L(shl_2), L(shl_table_fwd))
- .int JMPTBL (L(shl_3), L(shl_table_fwd))
- .int JMPTBL (L(shl_4), L(shl_table_fwd))
- .int JMPTBL (L(shl_5), L(shl_table_fwd))
- .int JMPTBL (L(shl_6), L(shl_table_fwd))
- .int JMPTBL (L(shl_7), L(shl_table_fwd))
- .int JMPTBL (L(shl_8), L(shl_table_fwd))
- .int JMPTBL (L(shl_9), L(shl_table_fwd))
- .int JMPTBL (L(shl_10), L(shl_table_fwd))
- .int JMPTBL (L(shl_11), L(shl_table_fwd))
- .int JMPTBL (L(shl_12), L(shl_table_fwd))
- .int JMPTBL (L(shl_13), L(shl_table_fwd))
- .int JMPTBL (L(shl_14), L(shl_table_fwd))
- .int JMPTBL (L(shl_15), L(shl_table_fwd))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
deleted file mode 100644
index f9a4e9aff9..0000000000
--- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3_back
-#define MEMCPY_CHK __memmove_chk_ssse3_back
-#include "memcpy-ssse3-back.S"
--
2.25.1
* [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (3 preceding siblings ...)
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-04-14 16:47 ` Noah Goldstein
2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
5 siblings, 0 replies; 49+ messages in thread
From: Noah Goldstein @ 2022-04-14 16:47 UTC (permalink / raw)
To: libc-alpha
The goal is to remove most SSSE3 functions, as the SSE4, AVX2, and
EVEX versions are generally preferable. memcpy/memmove is one
exception, where avoiding unaligned loads with `palignr` is important
for some targets.
This commit replaces memmove-ssse3 with a better optimized, lower
code-footprint version. It also aliases memcpy to memmove.
Aside from this function, all other SSSE3 functions should be safe to
remove.
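(Illustration, not part of the patch: a minimal C-intrinsics sketch of
the `palignr` technique referred to above, assuming a fixed source
misalignment SHIFT. The function name and SHIFT value are
hypothetical; the real code is hand-written assembly that dispatches
through a 16-entry shl table because `palignr` takes an immediate
shift count.)

#include <tmmintrin.h>  /* SSSE3 intrinsics; compile with -mssse3.  */

/* Copy 16 bytes from a source misaligned by SHIFT bytes using only
   16-byte-aligned loads.  Two aligned loads straddle the data;
   palignr concatenates them and shifts out the misalignment, giving
   the same bytes an unaligned load of (src_aligned + SHIFT) would.
   (The real loops arrange not to over-read past the buffer.)  */
#define SHIFT 4  /* hypothetical (src & 15); must be an immediate */

static void
copy16_from_misaligned (unsigned char *dst,
			const unsigned char *src_aligned)
{
  __m128i lo = _mm_load_si128 ((const __m128i *) src_aligned);
  __m128i hi = _mm_load_si128 ((const __m128i *) (src_aligned + 16));
  __m128i v = _mm_alignr_epi8 (hi, lo, SHIFT);
  _mm_storeu_si128 ((__m128i *) dst, v);
}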
The performance does not change drastically, although it shows overall
improvement without any major regressions or gains.
bench-memcpy geometric_mean(N=50) New / Original: 0.957
bench-memcpy-random geometric_mean(N=50) New / Original: 0.912
bench-memcpy-large geometric_mean(N=50) New / Original: 0.892
Benchmarks were run on a Zhaoxin KX-6840@2000MHz. See attached numbers
for all results.
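(Illustration, not part of the patch: how a geometric_mean(N) figure
like the ones above can be computed from the per-case New/Old time
ratios; a value below 1.0 means the new version is faster on balance.
The helper name is illustrative, not from the benchmark harness.)

#include <math.h>
#include <stddef.h>

/* Geometric mean of n timing ratios, computed as exp(mean(log r_i))
   to avoid overflow/underflow from multiplying many ratios.
   Link with -lm.  */
static double
geometric_mean (const double *ratios, size_t n)
{
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  return exp (log_sum / (double) n);
}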
More importantly, this saves 7246 bytes of code size in memmove and an
additional 10741 bytes by reusing the memmove code for memcpy (17987
bytes saved in total), as well as an additional 896 bytes of rodata
for the jump table entries.
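(Illustration, not part of the patch: a C model of how the
self-relative jump tables deleted above are consumed. Each JMPTBL
entry stores `target - table` in 4 bytes, which is why each 144-entry
table costs 576 bytes of rodata; the names here are illustrative.)

#include <stddef.h>

/* Recover the absolute branch target for a copy of `len` bytes from
   a table of 32-bit self-relative offsets (.int target - table).
   Storing offsets instead of 8-byte pointers halves the rodata and
   keeps the table position-independent.  */
static const void *
jump_table_target (const int *table, size_t len)
{
  return (const char *) table + table[len];
}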
---
Results For: bench-memcpy
length, align1, align2, dst > src, New Time / Old Time
1, 0, 0, 0, 0.946
1, 0, 0, 1, 0.946
1, 32, 0, 0, 0.948
1, 32, 0, 1, 1.185
1, 0, 32, 0, 0.982
1, 0, 32, 1, 1.14
1, 32, 32, 0, 0.981
1, 32, 32, 1, 1.057
1, 2048, 0, 0, 0.945
1, 2048, 0, 1, 0.945
2, 0, 0, 0, 1.041
2, 0, 0, 1, 1.041
2, 1, 0, 0, 1.044
2, 1, 0, 1, 1.044
2, 33, 0, 0, 1.044
2, 33, 0, 1, 1.044
2, 0, 1, 0, 1.041
2, 0, 1, 1, 1.041
2, 0, 33, 0, 1.042
2, 0, 33, 1, 1.041
2, 1, 1, 0, 1.041
2, 1, 1, 1, 1.041
2, 33, 33, 0, 1.041
2, 33, 33, 1, 1.041
2, 2048, 0, 0, 1.042
2, 2048, 0, 1, 1.041
2, 2049, 0, 0, 1.044
2, 2049, 0, 1, 1.044
2, 2048, 1, 0, 1.041
2, 2048, 1, 1, 1.042
2, 2049, 1, 0, 1.042
2, 2049, 1, 1, 1.042
4, 0, 0, 0, 0.962
4, 0, 0, 1, 0.962
4, 2, 0, 0, 0.98
4, 2, 0, 1, 0.984
4, 34, 0, 0, 0.986
4, 34, 0, 1, 0.987
4, 0, 2, 0, 0.962
4, 0, 2, 1, 0.962
4, 0, 34, 0, 0.962
4, 0, 34, 1, 0.962
4, 2, 2, 0, 0.962
4, 2, 2, 1, 0.962
4, 34, 34, 0, 0.962
4, 34, 34, 1, 0.962
4, 2048, 0, 0, 0.962
4, 2048, 0, 1, 0.962
4, 2050, 0, 0, 0.996
4, 2050, 0, 1, 1.0
4, 2048, 2, 0, 0.962
4, 2048, 2, 1, 0.962
4, 2050, 2, 0, 0.962
4, 2050, 2, 1, 0.962
8, 0, 0, 0, 0.962
8, 0, 0, 1, 0.962
8, 3, 0, 0, 1.0
8, 3, 0, 1, 1.0
8, 35, 0, 0, 1.001
8, 35, 0, 1, 1.0
8, 0, 3, 0, 0.962
8, 0, 3, 1, 0.962
8, 0, 35, 0, 0.962
8, 0, 35, 1, 0.962
8, 3, 3, 0, 0.962
8, 3, 3, 1, 0.962
8, 35, 35, 0, 0.962
8, 35, 35, 1, 0.962
8, 2048, 0, 0, 0.962
8, 2048, 0, 1, 0.962
8, 2051, 0, 0, 1.0
8, 2051, 0, 1, 1.0
8, 2048, 3, 0, 0.962
8, 2048, 3, 1, 0.962
8, 2051, 3, 0, 0.962
8, 2051, 3, 1, 0.962
16, 0, 0, 0, 0.798
16, 0, 0, 1, 0.799
16, 4, 0, 0, 0.801
16, 4, 0, 1, 0.801
16, 36, 0, 0, 0.801
16, 36, 0, 1, 0.801
16, 0, 4, 0, 0.798
16, 0, 4, 1, 0.799
16, 0, 36, 0, 0.799
16, 0, 36, 1, 0.799
16, 4, 4, 0, 0.799
16, 4, 4, 1, 0.799
16, 36, 36, 0, 0.799
16, 36, 36, 1, 0.799
16, 2048, 0, 0, 0.799
16, 2048, 0, 1, 0.799
16, 2052, 0, 0, 0.801
16, 2052, 0, 1, 0.801
16, 2048, 4, 0, 0.798
16, 2048, 4, 1, 0.799
16, 2052, 4, 0, 0.799
16, 2052, 4, 1, 0.799
32, 0, 0, 0, 0.472
32, 0, 0, 1, 0.472
32, 5, 0, 0, 0.472
32, 5, 0, 1, 0.472
32, 37, 0, 0, 0.962
32, 37, 0, 1, 0.962
32, 0, 5, 0, 0.472
32, 0, 5, 1, 0.472
32, 0, 37, 0, 1.021
32, 0, 37, 1, 1.021
32, 5, 5, 0, 0.472
32, 5, 5, 1, 0.472
32, 37, 37, 0, 1.011
32, 37, 37, 1, 1.011
32, 2048, 0, 0, 0.472
32, 2048, 0, 1, 0.472
32, 2053, 0, 0, 0.472
32, 2053, 0, 1, 0.472
32, 2048, 5, 0, 0.472
32, 2048, 5, 1, 0.472
32, 2053, 5, 0, 0.472
32, 2053, 5, 1, 0.472
64, 0, 0, 0, 1.0
64, 0, 0, 1, 1.0
64, 6, 0, 0, 0.862
64, 6, 0, 1, 0.862
64, 38, 0, 0, 0.912
64, 38, 0, 1, 0.912
64, 0, 6, 0, 0.896
64, 0, 6, 1, 0.896
64, 0, 38, 0, 0.906
64, 0, 38, 1, 0.906
64, 6, 6, 0, 0.91
64, 6, 6, 1, 0.91
64, 38, 38, 0, 0.883
64, 38, 38, 1, 0.883
64, 2048, 0, 0, 1.0
64, 2048, 0, 1, 1.0
64, 2054, 0, 0, 0.862
64, 2054, 0, 1, 0.862
64, 2048, 6, 0, 0.887
64, 2048, 6, 1, 0.887
64, 2054, 6, 0, 0.887
64, 2054, 6, 1, 0.887
128, 0, 0, 0, 0.857
128, 0, 0, 1, 0.857
128, 7, 0, 0, 0.875
128, 7, 0, 1, 0.875
128, 39, 0, 0, 0.892
128, 39, 0, 1, 0.892
128, 0, 7, 0, 1.183
128, 0, 7, 1, 1.183
128, 0, 39, 0, 1.113
128, 0, 39, 1, 1.113
128, 7, 7, 0, 0.692
128, 7, 7, 1, 0.692
128, 39, 39, 0, 1.104
128, 39, 39, 1, 1.104
128, 2048, 0, 0, 0.857
128, 2048, 0, 1, 0.857
128, 2055, 0, 0, 0.875
128, 2055, 0, 1, 0.875
128, 2048, 7, 0, 0.959
128, 2048, 7, 1, 0.959
128, 2055, 7, 0, 1.036
128, 2055, 7, 1, 1.036
256, 0, 0, 0, 0.889
256, 0, 0, 1, 0.889
256, 8, 0, 0, 0.966
256, 8, 0, 1, 0.966
256, 40, 0, 0, 0.983
256, 40, 0, 1, 0.983
256, 0, 8, 0, 1.29
256, 0, 8, 1, 1.29
256, 0, 40, 0, 1.274
256, 0, 40, 1, 1.274
256, 8, 8, 0, 0.865
256, 8, 8, 1, 0.865
256, 40, 40, 0, 1.477
256, 40, 40, 1, 1.477
256, 2048, 0, 0, 0.889
256, 2048, 0, 1, 0.889
256, 2056, 0, 0, 0.966
256, 2056, 0, 1, 0.966
256, 2048, 8, 0, 0.952
256, 2048, 8, 1, 0.952
256, 2056, 8, 0, 0.878
256, 2056, 8, 1, 0.878
512, 0, 0, 0, 1.077
512, 0, 0, 1, 1.077
512, 9, 0, 0, 1.0
512, 9, 0, 1, 1.0
512, 41, 0, 0, 0.954
512, 41, 0, 1, 0.954
512, 0, 9, 0, 1.191
512, 0, 9, 1, 1.191
512, 0, 41, 0, 1.181
512, 0, 41, 1, 1.181
512, 9, 9, 0, 0.765
512, 9, 9, 1, 0.765
512, 41, 41, 0, 0.905
512, 41, 41, 1, 0.905
512, 2048, 0, 0, 1.077
512, 2048, 0, 1, 1.077
512, 2057, 0, 0, 1.0
512, 2057, 0, 1, 1.0
512, 2048, 9, 0, 1.0
512, 2048, 9, 1, 1.0
512, 2057, 9, 0, 0.733
512, 2057, 9, 1, 0.733
1024, 0, 0, 0, 1.143
1024, 0, 0, 1, 1.143
1024, 10, 0, 0, 1.015
1024, 10, 0, 1, 1.015
1024, 42, 0, 0, 1.045
1024, 42, 0, 1, 1.045
1024, 0, 10, 0, 1.126
1024, 0, 10, 1, 1.126
1024, 0, 42, 0, 1.114
1024, 0, 42, 1, 1.114
1024, 10, 10, 0, 0.89
1024, 10, 10, 1, 0.89
1024, 42, 42, 0, 0.986
1024, 42, 42, 1, 0.986
1024, 2048, 0, 0, 1.143
1024, 2048, 0, 1, 1.143
1024, 2058, 0, 0, 1.015
1024, 2058, 0, 1, 1.015
1024, 2048, 10, 0, 1.03
1024, 2048, 10, 1, 1.03
1024, 2058, 10, 0, 0.854
1024, 2058, 10, 1, 0.854
2048, 0, 0, 0, 1.005
2048, 0, 0, 1, 1.005
2048, 11, 0, 0, 1.013
2048, 11, 0, 1, 1.014
2048, 43, 0, 0, 1.044
2048, 43, 0, 1, 1.044
2048, 0, 11, 0, 1.002
2048, 0, 11, 1, 1.003
2048, 0, 43, 0, 1.003
2048, 0, 43, 1, 1.003
2048, 11, 11, 0, 0.92
2048, 11, 11, 1, 0.92
2048, 43, 43, 0, 1.0
2048, 43, 43, 1, 1.0
2048, 2048, 0, 0, 1.005
2048, 2048, 0, 1, 1.005
2048, 2059, 0, 0, 0.904
2048, 2059, 0, 1, 0.904
2048, 2048, 11, 0, 1.0
2048, 2048, 11, 1, 1.0
2048, 2059, 11, 0, 0.979
2048, 2059, 11, 1, 0.979
4096, 0, 0, 0, 1.014
4096, 0, 0, 1, 1.014
4096, 12, 0, 0, 0.855
4096, 12, 0, 1, 0.855
4096, 44, 0, 0, 0.857
4096, 44, 0, 1, 0.857
4096, 0, 12, 0, 0.932
4096, 0, 12, 1, 0.932
4096, 0, 44, 0, 0.932
4096, 0, 44, 1, 0.933
4096, 12, 12, 0, 0.999
4096, 12, 12, 1, 0.999
4096, 44, 44, 0, 1.051
4096, 44, 44, 1, 1.051
4096, 2048, 0, 0, 1.014
4096, 2048, 0, 1, 1.014
4096, 2060, 0, 0, 0.967
4096, 2060, 0, 1, 0.967
4096, 2048, 12, 0, 0.769
4096, 2048, 12, 1, 0.769
4096, 2060, 12, 0, 0.943
4096, 2060, 12, 1, 0.943
8192, 0, 0, 0, 1.045
8192, 0, 0, 1, 1.046
8192, 13, 0, 0, 0.885
8192, 13, 0, 1, 0.885
8192, 45, 0, 0, 0.887
8192, 45, 0, 1, 0.887
8192, 0, 13, 0, 0.942
8192, 0, 13, 1, 0.942
8192, 0, 45, 0, 0.942
8192, 0, 45, 1, 0.942
8192, 13, 13, 0, 1.03
8192, 13, 13, 1, 1.029
8192, 45, 45, 0, 1.048
8192, 45, 45, 1, 1.049
8192, 2048, 0, 0, 1.048
8192, 2048, 0, 1, 1.048
8192, 2061, 0, 0, 1.011
8192, 2061, 0, 1, 1.011
8192, 2048, 13, 0, 0.789
8192, 2048, 13, 1, 0.788
8192, 2061, 13, 0, 0.991
8192, 2061, 13, 1, 0.992
16384, 0, 0, 0, 1.026
16384, 0, 0, 1, 1.011
16384, 14, 0, 0, 0.943
16384, 14, 0, 1, 0.95
16384, 46, 0, 0, 0.856
16384, 46, 0, 1, 0.86
16384, 0, 14, 0, 0.815
16384, 0, 14, 1, 0.817
16384, 0, 46, 0, 0.859
16384, 0, 46, 1, 0.867
16384, 14, 14, 0, 0.987
16384, 14, 14, 1, 0.979
16384, 46, 46, 0, 1.027
16384, 46, 46, 1, 1.031
16384, 2048, 0, 0, 1.078
16384, 2048, 0, 1, 1.084
16384, 2062, 0, 0, 0.851
16384, 2062, 0, 1, 0.85
16384, 2048, 14, 0, 0.935
16384, 2048, 14, 1, 0.932
16384, 2062, 14, 0, 1.015
16384, 2062, 14, 1, 1.012
32768, 0, 0, 0, 0.978
32768, 0, 0, 1, 0.979
32768, 15, 0, 0, 1.006
32768, 15, 0, 1, 1.006
32768, 47, 0, 0, 1.004
32768, 47, 0, 1, 1.004
32768, 0, 15, 0, 1.045
32768, 0, 15, 1, 1.045
32768, 0, 47, 0, 1.011
32768, 0, 47, 1, 1.011
32768, 15, 15, 0, 0.977
32768, 15, 15, 1, 0.977
32768, 47, 47, 0, 0.96
32768, 47, 47, 1, 0.96
32768, 2048, 0, 0, 0.978
32768, 2048, 0, 1, 0.978
32768, 2063, 0, 0, 1.004
32768, 2063, 0, 1, 1.004
32768, 2048, 15, 0, 1.036
32768, 2048, 15, 1, 1.036
32768, 2063, 15, 0, 0.978
32768, 2063, 15, 1, 0.978
65536, 0, 0, 0, 0.981
65536, 0, 0, 1, 0.981
65536, 16, 0, 0, 0.987
65536, 16, 0, 1, 0.987
65536, 48, 0, 0, 0.968
65536, 48, 0, 1, 0.968
65536, 0, 16, 0, 1.014
65536, 0, 16, 1, 1.014
65536, 0, 48, 0, 0.984
65536, 0, 48, 1, 0.984
65536, 16, 16, 0, 1.01
65536, 16, 16, 1, 1.01
65536, 48, 48, 0, 0.968
65536, 48, 48, 1, 0.968
65536, 2048, 0, 0, 0.982
65536, 2048, 0, 1, 0.982
65536, 2064, 0, 0, 0.987
65536, 2064, 0, 1, 0.987
65536, 2048, 16, 0, 1.012
65536, 2048, 16, 1, 1.012
65536, 2064, 16, 0, 1.007
65536, 2064, 16, 1, 1.007
0, 0, 0, 0, 0.867
0, 2048, 0, 0, 0.867
0, 4095, 0, 0, 0.868
0, 0, 4095, 0, 0.866
1, 1, 0, 0, 1.108
1, 0, 1, 0, 0.946
1, 1, 1, 0, 0.946
1, 2049, 0, 0, 0.947
1, 2048, 1, 0, 0.945
1, 2049, 1, 0, 0.945
1, 4095, 0, 0, 1.482
1, 0, 4095, 0, 0.981
2, 2, 0, 0, 1.044
2, 0, 2, 0, 1.041
2, 2, 2, 0, 1.041
2, 2050, 0, 0, 1.044
2, 2048, 2, 0, 1.042
2, 2050, 2, 0, 1.041
2, 4095, 0, 0, 1.057
2, 0, 4095, 0, 1.022
3, 0, 0, 0, 0.899
3, 3, 0, 0, 0.902
3, 0, 3, 0, 0.9
3, 3, 3, 0, 0.9
3, 2048, 0, 0, 0.9
3, 2051, 0, 0, 0.902
3, 2048, 3, 0, 0.9
3, 2051, 3, 0, 0.9
3, 4095, 0, 0, 0.261
3, 0, 4095, 0, 0.211
4, 4, 0, 0, 0.965
4, 0, 4, 0, 0.962
4, 4, 4, 0, 0.962
4, 2052, 0, 0, 0.969
4, 2048, 4, 0, 0.962
4, 2052, 4, 0, 0.962
4, 4095, 0, 0, 1.971
4, 0, 4095, 0, 1.988
5, 0, 0, 0, 0.898
5, 5, 0, 0, 0.9
5, 0, 5, 0, 0.898
5, 5, 5, 0, 0.898
5, 2048, 0, 0, 0.898
5, 2053, 0, 0, 0.9
5, 2048, 5, 0, 0.898
5, 2053, 5, 0, 0.898
5, 4095, 0, 0, 0.935
5, 0, 4095, 0, 1.02
6, 0, 0, 0, 0.898
6, 6, 0, 0, 0.9
6, 0, 6, 0, 0.898
6, 6, 6, 0, 0.898
6, 2048, 0, 0, 0.898
6, 2054, 0, 0, 0.9
6, 2048, 6, 0, 0.898
6, 2054, 6, 0, 0.898
6, 4095, 0, 0, 0.935
6, 0, 4095, 0, 1.021
7, 0, 0, 0, 0.898
7, 7, 0, 0, 0.9
7, 0, 7, 0, 0.898
7, 7, 7, 0, 0.898
7, 2048, 0, 0, 0.898
7, 2055, 0, 0, 0.9
7, 2048, 7, 0, 0.898
7, 2055, 7, 0, 0.898
7, 4095, 0, 0, 0.935
7, 0, 4095, 0, 1.021
8, 8, 0, 0, 1.001
8, 0, 8, 0, 0.962
8, 8, 8, 0, 0.962
8, 2056, 0, 0, 1.0
8, 2048, 8, 0, 0.962
8, 2056, 8, 0, 0.962
8, 4095, 0, 0, 1.971
8, 0, 4095, 0, 1.988
9, 0, 0, 0, 0.898
9, 9, 0, 0, 0.9
9, 0, 9, 0, 0.899
9, 9, 9, 0, 0.899
9, 2048, 0, 0, 0.899
9, 2057, 0, 0, 0.9
9, 2048, 9, 0, 0.899
9, 2057, 9, 0, 0.899
9, 4095, 0, 0, 0.935
9, 0, 4095, 0, 1.019
10, 0, 0, 0, 0.898
10, 10, 0, 0, 0.9
10, 0, 10, 0, 0.899
10, 10, 10, 0, 0.899
10, 2048, 0, 0, 0.899
10, 2058, 0, 0, 0.9
10, 2048, 10, 0, 0.899
10, 2058, 10, 0, 0.899
10, 4095, 0, 0, 0.935
10, 0, 4095, 0, 1.02
11, 0, 0, 0, 0.898
11, 11, 0, 0, 0.9
11, 0, 11, 0, 0.899
11, 11, 11, 0, 0.899
11, 2048, 0, 0, 0.899
11, 2059, 0, 0, 0.9
11, 2048, 11, 0, 0.899
11, 2059, 11, 0, 0.899
11, 4095, 0, 0, 0.935
11, 0, 4095, 0, 1.02
12, 0, 0, 0, 0.898
12, 12, 0, 0, 0.9
12, 0, 12, 0, 0.899
12, 12, 12, 0, 0.899
12, 2048, 0, 0, 0.899
12, 2060, 0, 0, 0.9
12, 2048, 12, 0, 0.899
12, 2060, 12, 0, 0.899
12, 4095, 0, 0, 0.935
12, 0, 4095, 0, 1.018
13, 0, 0, 0, 0.897
13, 13, 0, 0, 0.901
13, 0, 13, 0, 0.898
13, 13, 13, 0, 0.898
13, 2048, 0, 0, 0.898
13, 2061, 0, 0, 0.9
13, 2048, 13, 0, 0.898
13, 2061, 13, 0, 0.898
13, 4095, 0, 0, 0.935
13, 0, 4095, 0, 1.019
14, 0, 0, 0, 0.897
14, 14, 0, 0, 0.9
14, 0, 14, 0, 0.898
14, 14, 14, 0, 0.898
14, 2048, 0, 0, 0.898
14, 2062, 0, 0, 0.9
14, 2048, 14, 0, 0.898
14, 2062, 14, 0, 0.898
14, 4095, 0, 0, 0.935
14, 0, 4095, 0, 1.02
15, 0, 0, 0, 0.897
15, 15, 0, 0, 0.901
15, 0, 15, 0, 0.898
15, 15, 15, 0, 0.898
15, 2048, 0, 0, 0.898
15, 2063, 0, 0, 0.9
15, 2048, 15, 0, 0.898
15, 2063, 15, 0, 0.898
15, 4095, 0, 0, 0.935
15, 0, 4095, 0, 1.02
16, 16, 0, 0, 0.801
16, 0, 16, 0, 0.799
16, 16, 16, 0, 0.799
16, 2064, 0, 0, 0.801
16, 2048, 16, 0, 0.799
16, 2064, 16, 0, 0.799
16, 4095, 0, 0, 1.818
16, 0, 4095, 0, 1.957
17, 0, 0, 0, 0.798
17, 17, 0, 0, 0.801
17, 0, 17, 0, 0.799
17, 17, 17, 0, 0.799
17, 2048, 0, 0, 0.799
17, 2065, 0, 0, 0.801
17, 2048, 17, 0, 0.799
17, 2065, 17, 0, 0.799
17, 4095, 0, 0, 0.938
17, 0, 4095, 0, 1.021
18, 0, 0, 0, 0.798
18, 18, 0, 0, 0.801
18, 0, 18, 0, 0.799
18, 18, 18, 0, 0.799
18, 2048, 0, 0, 0.799
18, 2066, 0, 0, 0.801
18, 2048, 18, 0, 0.799
18, 2066, 18, 0, 0.799
18, 4095, 0, 0, 0.938
18, 0, 4095, 0, 1.021
19, 0, 0, 0, 0.798
19, 19, 0, 0, 0.801
19, 0, 19, 0, 0.799
19, 19, 19, 0, 0.799
19, 2048, 0, 0, 0.799
19, 2067, 0, 0, 0.801
19, 2048, 19, 0, 0.799
19, 2067, 19, 0, 0.799
19, 4095, 0, 0, 0.938
19, 0, 4095, 0, 1.021
20, 0, 0, 0, 0.798
20, 20, 0, 0, 0.801
20, 0, 20, 0, 0.799
20, 20, 20, 0, 0.799
20, 2048, 0, 0, 0.799
20, 2068, 0, 0, 0.801
20, 2048, 20, 0, 0.799
20, 2068, 20, 0, 0.799
20, 4095, 0, 0, 0.937
20, 0, 4095, 0, 1.021
21, 0, 0, 0, 0.798
21, 21, 0, 0, 0.801
21, 0, 21, 0, 0.799
21, 21, 21, 0, 0.799
21, 2048, 0, 0, 0.799
21, 2069, 0, 0, 0.801
21, 2048, 21, 0, 0.799
21, 2069, 21, 0, 0.799
21, 4095, 0, 0, 0.938
21, 0, 4095, 0, 1.021
22, 0, 0, 0, 0.798
22, 22, 0, 0, 0.801
22, 0, 22, 0, 0.799
22, 22, 22, 0, 0.799
22, 2048, 0, 0, 0.799
22, 2070, 0, 0, 0.801
22, 2048, 22, 0, 0.799
22, 2070, 22, 0, 0.799
22, 4095, 0, 0, 0.938
22, 0, 4095, 0, 1.021
23, 0, 0, 0, 0.798
23, 23, 0, 0, 0.801
23, 0, 23, 0, 0.799
23, 23, 23, 0, 0.799
23, 2048, 0, 0, 0.799
23, 2071, 0, 0, 0.801
23, 2048, 23, 0, 0.799
23, 2071, 23, 0, 0.799
23, 4095, 0, 0, 0.938
23, 0, 4095, 0, 1.021
24, 0, 0, 0, 0.798
24, 24, 0, 0, 0.801
24, 0, 24, 0, 0.799
24, 24, 24, 0, 0.799
24, 2048, 0, 0, 0.799
24, 2072, 0, 0, 0.801
24, 2048, 24, 0, 0.799
24, 2072, 24, 0, 0.799
24, 4095, 0, 0, 0.937
24, 0, 4095, 0, 1.021
25, 0, 0, 0, 0.501
25, 25, 0, 0, 0.502
25, 0, 25, 0, 0.502
25, 25, 25, 0, 0.501
25, 2048, 0, 0, 0.501
25, 2073, 0, 0, 0.502
25, 2048, 25, 0, 0.502
25, 2073, 25, 0, 0.501
25, 4095, 0, 0, 0.974
25, 0, 4095, 0, 0.98
26, 0, 0, 0, 0.501
26, 26, 0, 0, 0.502
26, 0, 26, 0, 0.502
26, 26, 26, 0, 0.501
26, 2048, 0, 0, 0.501
26, 2074, 0, 0, 0.502
26, 2048, 26, 0, 0.502
26, 2074, 26, 0, 0.501
26, 4095, 0, 0, 0.974
26, 0, 4095, 0, 1.0
27, 0, 0, 0, 0.501
27, 27, 0, 0, 0.502
27, 0, 27, 0, 0.502
27, 27, 27, 0, 0.501
27, 2048, 0, 0, 0.501
27, 2075, 0, 0, 0.502
27, 2048, 27, 0, 0.502
27, 2075, 27, 0, 0.501
27, 4095, 0, 0, 0.974
27, 0, 4095, 0, 1.0
28, 0, 0, 0, 0.501
28, 28, 0, 0, 0.502
28, 0, 28, 0, 0.502
28, 28, 28, 0, 0.501
28, 2048, 0, 0, 0.501
28, 2076, 0, 0, 0.502
28, 2048, 28, 0, 0.502
28, 2076, 28, 0, 0.502
28, 4095, 0, 0, 0.974
28, 0, 4095, 0, 1.0
29, 0, 0, 0, 0.472
29, 29, 0, 0, 0.472
29, 0, 29, 0, 0.472
29, 29, 29, 0, 0.472
29, 2048, 0, 0, 0.472
29, 2077, 0, 0, 0.472
29, 2048, 29, 0, 0.472
29, 2077, 29, 0, 0.472
29, 4095, 0, 0, 0.974
29, 0, 4095, 0, 1.0
30, 0, 0, 0, 0.472
30, 30, 0, 0, 0.472
30, 0, 30, 0, 0.472
30, 30, 30, 0, 0.472
30, 2048, 0, 0, 0.472
30, 2078, 0, 0, 0.472
30, 2048, 30, 0, 0.472
30, 2078, 30, 0, 0.472
30, 4095, 0, 0, 0.974
30, 0, 4095, 0, 1.0
31, 0, 0, 0, 0.472
31, 31, 0, 0, 0.472
31, 0, 31, 0, 0.472
31, 31, 31, 0, 0.472
31, 2048, 0, 0, 0.472
31, 2079, 0, 0, 0.472
31, 2048, 31, 0, 0.472
31, 2079, 31, 0, 0.472
31, 4095, 0, 0, 0.974
31, 0, 4095, 0, 1.0
48, 0, 0, 0, 1.0
48, 0, 0, 1, 1.0
48, 3, 0, 0, 1.0
48, 3, 0, 1, 1.0
48, 0, 3, 0, 1.0
48, 0, 3, 1, 1.0
48, 3, 3, 0, 1.0
48, 3, 3, 1, 1.0
48, 2048, 0, 0, 1.0
48, 2048, 0, 1, 1.0
48, 2051, 0, 0, 1.0
48, 2051, 0, 1, 1.0
48, 2048, 3, 0, 1.0
48, 2048, 3, 1, 1.0
48, 2051, 3, 0, 1.0
48, 2051, 3, 1, 1.0
80, 0, 0, 0, 0.781
80, 0, 0, 1, 0.782
80, 5, 0, 0, 0.976
80, 5, 0, 1, 0.976
80, 0, 5, 0, 1.232
80, 0, 5, 1, 1.232
80, 5, 5, 0, 1.542
80, 5, 5, 1, 1.543
80, 2048, 0, 0, 0.781
80, 2048, 0, 1, 0.782
80, 2053, 0, 0, 0.976
80, 2053, 0, 1, 0.976
80, 2048, 5, 0, 1.093
80, 2048, 5, 1, 1.093
80, 2053, 5, 0, 1.371
80, 2053, 5, 1, 1.371
96, 0, 0, 0, 0.758
96, 0, 0, 1, 0.758
96, 6, 0, 0, 0.929
96, 6, 0, 1, 0.929
96, 0, 6, 0, 1.204
96, 0, 6, 1, 1.204
96, 6, 6, 0, 1.559
96, 6, 6, 1, 1.562
96, 2048, 0, 0, 0.758
96, 2048, 0, 1, 0.758
96, 2054, 0, 0, 0.929
96, 2054, 0, 1, 0.929
96, 2048, 6, 0, 1.068
96, 2048, 6, 1, 1.068
96, 2054, 6, 0, 1.562
96, 2054, 6, 1, 1.562
112, 0, 0, 0, 0.736
112, 0, 0, 1, 0.736
112, 7, 0, 0, 0.675
112, 7, 0, 1, 0.675
112, 0, 7, 0, 0.778
112, 0, 7, 1, 0.778
112, 7, 7, 0, 0.909
112, 7, 7, 1, 0.909
112, 2048, 0, 0, 0.736
112, 2048, 0, 1, 0.736
112, 2055, 0, 0, 0.675
112, 2055, 0, 1, 0.675
112, 2048, 7, 0, 0.778
112, 2048, 7, 1, 0.778
112, 2055, 7, 0, 0.909
112, 2055, 7, 1, 0.909
144, 0, 0, 0, 0.857
144, 0, 0, 1, 0.857
144, 9, 0, 0, 0.939
144, 9, 0, 1, 0.939
144, 0, 9, 0, 1.137
144, 0, 9, 1, 1.137
144, 9, 9, 0, 1.514
144, 9, 9, 1, 1.514
144, 2048, 0, 0, 0.857
144, 2048, 0, 1, 0.857
144, 2057, 0, 0, 0.939
144, 2057, 0, 1, 0.939
144, 2048, 9, 0, 0.922
144, 2048, 9, 1, 0.922
144, 2057, 9, 0, 1.514
144, 2057, 9, 1, 1.514
160, 0, 0, 0, 0.698
160, 0, 0, 1, 0.698
160, 10, 0, 0, 0.91
160, 10, 0, 1, 0.91
160, 0, 10, 0, 1.211
160, 0, 10, 1, 1.212
160, 10, 10, 0, 1.357
160, 10, 10, 1, 1.357
160, 2048, 0, 0, 0.698
160, 2048, 0, 1, 0.698
160, 2058, 0, 0, 0.91
160, 2058, 0, 1, 0.91
160, 2048, 10, 0, 0.923
160, 2048, 10, 1, 0.923
160, 2058, 10, 0, 1.357
160, 2058, 10, 1, 1.357
176, 0, 0, 0, 0.796
176, 0, 0, 1, 0.796
176, 11, 0, 0, 0.804
176, 11, 0, 1, 0.804
176, 0, 11, 0, 0.774
176, 0, 11, 1, 0.774
176, 11, 11, 0, 0.814
176, 11, 11, 1, 0.814
176, 2048, 0, 0, 0.796
176, 2048, 0, 1, 0.796
176, 2059, 0, 0, 0.804
176, 2059, 0, 1, 0.804
176, 2048, 11, 0, 0.774
176, 2048, 11, 1, 0.774
176, 2059, 11, 0, 0.814
176, 2059, 11, 1, 0.814
192, 0, 0, 0, 0.778
192, 0, 0, 1, 0.778
192, 12, 0, 0, 0.881
192, 12, 0, 1, 0.881
192, 0, 12, 0, 1.167
192, 0, 12, 1, 1.167
192, 12, 12, 0, 0.841
192, 12, 12, 1, 0.841
192, 2048, 0, 0, 0.778
192, 2048, 0, 1, 0.778
192, 2060, 0, 0, 0.881
192, 2060, 0, 1, 0.881
192, 2048, 12, 0, 0.889
192, 2048, 12, 1, 0.889
192, 2060, 12, 0, 0.906
192, 2060, 12, 1, 0.906
208, 0, 0, 0, 0.833
208, 0, 0, 1, 0.833
208, 13, 0, 0, 0.921
208, 13, 0, 1, 0.921
208, 0, 13, 0, 1.003
208, 0, 13, 1, 0.85
208, 13, 13, 0, 1.333
208, 13, 13, 1, 1.333
208, 2048, 0, 0, 0.834
208, 2048, 0, 1, 0.833
208, 2061, 0, 0, 0.921
208, 2061, 0, 1, 0.921
208, 2048, 13, 0, 0.833
208, 2048, 13, 1, 0.833
208, 2061, 13, 0, 1.333
208, 2061, 13, 1, 1.333
224, 0, 0, 0, 0.93
224, 0, 0, 1, 0.93
224, 14, 0, 0, 1.0
224, 14, 0, 1, 1.0
224, 0, 14, 0, 1.15
224, 0, 14, 1, 1.15
224, 14, 14, 0, 1.452
224, 14, 14, 1, 1.452
224, 2048, 0, 0, 0.93
224, 2048, 0, 1, 0.93
224, 2062, 0, 0, 1.0
224, 2062, 0, 1, 1.0
224, 2048, 14, 0, 0.833
224, 2048, 14, 1, 0.833
224, 2062, 14, 0, 1.452
224, 2062, 14, 1, 1.452
240, 0, 0, 0, 0.909
240, 0, 0, 1, 0.909
240, 15, 0, 0, 0.797
240, 15, 0, 1, 0.797
240, 0, 15, 0, 0.771
240, 0, 15, 1, 0.771
240, 15, 15, 0, 0.93
240, 15, 15, 1, 0.93
240, 2048, 0, 0, 0.909
240, 2048, 0, 1, 0.909
240, 2063, 0, 0, 0.797
240, 2063, 0, 1, 0.797
240, 2048, 15, 0, 0.771
240, 2048, 15, 1, 0.771
240, 2063, 15, 0, 0.93
240, 2063, 15, 1, 0.93
272, 0, 0, 0, 0.9
272, 0, 0, 1, 0.9
272, 17, 0, 0, 1.015
272, 17, 0, 1, 1.015
272, 0, 17, 0, 0.927
272, 0, 17, 1, 0.927
272, 17, 17, 0, 0.892
272, 17, 17, 1, 0.892
272, 2048, 0, 0, 0.9
272, 2048, 0, 1, 0.9
272, 2065, 0, 0, 1.015
272, 2065, 0, 1, 1.015
272, 2048, 17, 0, 0.927
272, 2048, 17, 1, 0.927
272, 2065, 17, 0, 0.878
272, 2065, 17, 1, 0.878
288, 0, 0, 0, 0.882
288, 0, 0, 1, 0.882
288, 18, 0, 0, 0.803
288, 18, 0, 1, 0.803
288, 0, 18, 0, 0.768
288, 0, 18, 1, 0.768
288, 18, 18, 0, 0.882
288, 18, 18, 1, 0.882
288, 2048, 0, 0, 0.882
288, 2048, 0, 1, 0.882
288, 2066, 0, 0, 0.803
288, 2066, 0, 1, 0.803
288, 2048, 18, 0, 0.768
288, 2048, 18, 1, 0.768
288, 2066, 18, 0, 0.882
288, 2066, 18, 1, 0.882
304, 0, 0, 0, 0.865
304, 0, 0, 1, 0.866
304, 19, 0, 0, 0.944
304, 19, 0, 1, 0.944
304, 0, 19, 0, 0.943
304, 0, 19, 1, 0.943
304, 19, 19, 0, 0.956
304, 19, 19, 1, 0.956
304, 2048, 0, 0, 0.865
304, 2048, 0, 1, 0.865
304, 2067, 0, 0, 0.944
304, 2067, 0, 1, 0.944
304, 2048, 19, 0, 0.943
304, 2048, 19, 1, 0.943
304, 2067, 19, 0, 0.947
304, 2067, 19, 1, 0.947
320, 0, 0, 0, 0.944
320, 0, 0, 1, 0.944
320, 20, 0, 0, 0.962
320, 20, 0, 1, 0.962
320, 0, 20, 0, 1.214
320, 0, 20, 1, 1.214
320, 20, 20, 0, 1.365
320, 20, 20, 1, 1.365
320, 2048, 0, 0, 0.944
320, 2048, 0, 1, 0.944
320, 2068, 0, 0, 0.962
320, 2068, 0, 1, 0.962
320, 2048, 20, 0, 0.914
320, 2048, 20, 1, 0.914
320, 2068, 20, 0, 1.365
320, 2068, 20, 1, 1.365
336, 0, 0, 0, 1.0
336, 0, 0, 1, 1.0
336, 21, 0, 0, 0.986
336, 21, 0, 1, 0.986
336, 0, 21, 0, 0.853
336, 0, 21, 1, 0.853
336, 21, 21, 0, 0.843
336, 21, 21, 1, 0.843
336, 2048, 0, 0, 1.0
336, 2048, 0, 1, 1.0
336, 2069, 0, 0, 0.986
336, 2069, 0, 1, 0.986
336, 2048, 21, 0, 0.853
336, 2048, 21, 1, 0.853
336, 2069, 21, 0, 0.831
336, 2069, 21, 1, 0.831
352, 0, 0, 0, 0.98
352, 0, 0, 1, 0.98
352, 22, 0, 0, 0.811
352, 22, 0, 1, 0.811
352, 0, 22, 0, 0.882
352, 0, 22, 1, 0.882
352, 22, 22, 0, 1.1
352, 22, 22, 1, 1.1
352, 2048, 0, 0, 0.98
352, 2048, 0, 1, 0.98
352, 2070, 0, 0, 0.811
352, 2070, 0, 1, 0.811
352, 2048, 22, 0, 0.882
352, 2048, 22, 1, 0.882
352, 2070, 22, 0, 1.1
352, 2070, 22, 1, 1.1
368, 0, 0, 0, 1.058
368, 0, 0, 1, 1.058
368, 23, 0, 0, 1.0
368, 23, 0, 1, 1.0
368, 0, 23, 0, 0.948
368, 0, 23, 1, 0.948
368, 23, 23, 0, 0.723
368, 23, 23, 1, 0.723
368, 2048, 0, 0, 1.058
368, 2048, 0, 1, 1.058
368, 2071, 0, 0, 1.0
368, 2071, 0, 1, 1.0
368, 2048, 23, 0, 0.948
368, 2048, 23, 1, 0.948
368, 2071, 23, 0, 0.701
368, 2071, 23, 1, 0.701
384, 0, 0, 0, 1.012
384, 0, 0, 1, 1.012
384, 24, 0, 0, 1.04
384, 24, 0, 1, 1.04
384, 0, 24, 0, 1.154
384, 0, 24, 1, 1.154
384, 24, 24, 0, 1.423
384, 24, 24, 1, 1.423
384, 2048, 0, 0, 1.012
384, 2048, 0, 1, 1.012
384, 2072, 0, 0, 1.04
384, 2072, 0, 1, 1.04
384, 2048, 24, 0, 0.91
384, 2048, 24, 1, 0.91
384, 2072, 24, 0, 1.423
384, 2072, 24, 1, 1.423
400, 0, 0, 0, 0.948
400, 0, 0, 1, 0.948
400, 25, 0, 0, 0.957
400, 25, 0, 1, 0.957
400, 0, 25, 0, 1.054
400, 0, 25, 1, 1.097
400, 25, 25, 0, 0.885
400, 25, 25, 1, 0.885
400, 2048, 0, 0, 0.948
400, 2048, 0, 1, 0.948
400, 2073, 0, 0, 0.957
400, 2073, 0, 1, 0.957
400, 2048, 25, 0, 0.94
400, 2048, 25, 1, 0.94
400, 2073, 25, 0, 0.908
400, 2073, 25, 1, 0.908
416, 0, 0, 0, 1.017
416, 0, 0, 1, 1.017
416, 26, 0, 0, 0.903
416, 26, 0, 1, 0.903
416, 0, 26, 0, 0.881
416, 0, 26, 1, 0.881
416, 26, 26, 0, 1.035
416, 26, 26, 1, 1.035
416, 2048, 0, 0, 1.017
416, 2048, 0, 1, 1.017
416, 2074, 0, 0, 0.903
416, 2074, 0, 1, 0.903
416, 2048, 26, 0, 0.881
416, 2048, 26, 1, 0.881
416, 2074, 26, 0, 1.035
416, 2074, 26, 1, 1.035
432, 0, 0, 0, 1.0
432, 0, 0, 1, 1.0
432, 27, 0, 0, 0.933
432, 27, 0, 1, 0.933
432, 0, 27, 0, 0.941
432, 0, 27, 1, 0.941
432, 27, 27, 0, 0.953
432, 27, 27, 1, 0.954
432, 2048, 0, 0, 1.0
432, 2048, 0, 1, 1.0
432, 2075, 0, 0, 0.933
432, 2075, 0, 1, 0.933
432, 2048, 27, 0, 0.941
432, 2048, 27, 1, 0.941
432, 2075, 27, 0, 0.93
432, 2075, 27, 1, 0.93
448, 0, 0, 0, 0.984
448, 0, 0, 1, 0.984
448, 28, 0, 0, 0.896
448, 28, 0, 1, 0.896
448, 0, 28, 0, 1.244
448, 0, 28, 1, 1.244
448, 28, 28, 0, 1.333
448, 28, 28, 1, 1.333
448, 2048, 0, 0, 0.984
448, 2048, 0, 1, 0.984
448, 2076, 0, 0, 0.896
448, 2076, 0, 1, 0.896
448, 2048, 28, 0, 0.988
448, 2048, 28, 1, 0.988
448, 2076, 28, 0, 1.333
448, 2076, 28, 1, 1.333
464, 0, 0, 0, 1.083
464, 0, 0, 1, 1.083
464, 29, 0, 0, 0.978
464, 29, 0, 1, 0.978
464, 0, 29, 0, 0.924
464, 0, 29, 1, 0.924
464, 29, 29, 0, 0.901
464, 29, 29, 1, 0.901
464, 2048, 0, 0, 1.083
464, 2048, 0, 1, 1.083
464, 2077, 0, 0, 0.978
464, 2077, 0, 1, 0.978
464, 2048, 29, 0, 0.924
464, 2048, 29, 1, 0.924
464, 2077, 29, 0, 0.89
464, 2077, 29, 1, 0.89
480, 0, 0, 0, 1.066
480, 0, 0, 1, 1.066
480, 30, 0, 0, 0.9
480, 30, 0, 1, 0.9
480, 0, 30, 0, 0.88
480, 0, 30, 1, 0.88
480, 30, 30, 0, 1.083
480, 30, 30, 1, 1.083
480, 2048, 0, 0, 1.066
480, 2048, 0, 1, 1.066
480, 2078, 0, 0, 0.9
480, 2078, 0, 1, 0.9
480, 2048, 30, 0, 0.88
480, 2048, 30, 1, 0.88
480, 2078, 30, 0, 1.083
480, 2078, 30, 1, 1.083
496, 0, 0, 0, 1.032
496, 0, 0, 1, 1.032
496, 31, 0, 0, 0.95
496, 31, 0, 1, 0.95
496, 0, 31, 0, 1.011
496, 0, 31, 1, 1.011
496, 31, 31, 0, 0.973
496, 31, 31, 1, 0.973
496, 2048, 0, 0, 1.032
496, 2048, 0, 1, 1.032
496, 2079, 0, 0, 0.95
496, 2079, 0, 1, 0.95
496, 2048, 31, 0, 1.011
496, 2048, 31, 1, 1.011
496, 2079, 31, 0, 0.941
496, 2079, 31, 1, 0.941
1024, 32, 0, 0, 1.143
1024, 32, 0, 1, 1.143
1024, 0, 32, 0, 1.143
1024, 0, 32, 1, 1.143
1024, 32, 32, 0, 1.143
1024, 32, 32, 1, 1.143
1024, 2080, 0, 0, 1.143
1024, 2080, 0, 1, 1.143
1024, 2048, 32, 0, 1.143
1024, 2048, 32, 1, 1.143
1024, 2080, 32, 0, 1.143
1024, 2080, 32, 1, 1.143
1056, 0, 0, 0, 1.165
1056, 0, 0, 1, 1.162
1056, 33, 0, 0, 1.067
1056, 33, 0, 1, 1.067
1056, 0, 33, 0, 0.977
1056, 0, 33, 1, 0.977
1056, 33, 33, 0, 1.043
1056, 33, 33, 1, 1.043
1056, 2048, 0, 0, 1.168
1056, 2048, 0, 1, 1.168
1056, 2081, 0, 0, 1.067
1056, 2081, 0, 1, 1.067
1056, 2048, 33, 0, 0.977
1056, 2048, 33, 1, 0.977
1056, 2081, 33, 0, 1.0
1056, 2081, 33, 1, 1.0
1088, 0, 0, 0, 1.171
1088, 0, 0, 1, 1.171
1088, 34, 0, 0, 1.041
1088, 34, 0, 1, 1.041
1088, 0, 34, 0, 1.079
1088, 0, 34, 1, 1.079
1088, 34, 34, 0, 0.966
1088, 34, 34, 1, 0.966
1088, 2048, 0, 0, 1.171
1088, 2048, 0, 1, 1.171
1088, 2082, 0, 0, 1.041
1088, 2082, 0, 1, 1.041
1088, 2048, 34, 0, 0.994
1088, 2048, 34, 1, 0.994
1088, 2082, 34, 0, 0.966
1088, 2082, 34, 1, 0.966
1120, 0, 0, 0, 1.154
1120, 0, 0, 1, 1.151
1120, 35, 0, 0, 1.051
1120, 35, 0, 1, 1.051
1120, 0, 35, 0, 1.0
1120, 0, 35, 1, 1.0
1120, 35, 35, 0, 1.068
1120, 35, 35, 1, 1.068
1120, 2048, 0, 0, 1.151
1120, 2048, 0, 1, 1.151
1120, 2083, 0, 0, 1.051
1120, 2083, 0, 1, 1.051
1120, 2048, 35, 0, 1.0
1120, 2048, 35, 1, 1.0
1120, 2083, 35, 0, 1.027
1120, 2083, 35, 1, 1.027
1152, 0, 0, 0, 1.159
1152, 0, 0, 1, 1.159
1152, 36, 0, 0, 1.034
1152, 36, 0, 1, 1.034
1152, 0, 36, 0, 1.07
1152, 0, 36, 1, 1.07
1152, 36, 36, 0, 0.967
1152, 36, 36, 1, 0.967
1152, 2048, 0, 0, 1.159
1152, 2048, 0, 1, 1.159
1152, 2084, 0, 0, 1.034
1152, 2084, 0, 1, 1.034
1152, 2048, 36, 0, 0.984
1152, 2048, 36, 1, 0.984
1152, 2084, 36, 0, 0.967
1152, 2084, 36, 1, 0.967
1184, 0, 0, 0, 1.157
1184, 0, 0, 1, 1.157
1184, 37, 0, 0, 1.066
1184, 37, 0, 1, 1.066
1184, 0, 37, 0, 0.993
1184, 0, 37, 1, 0.993
1184, 37, 37, 0, 1.08
1184, 37, 37, 1, 1.081
1184, 2048, 0, 0, 1.157
1184, 2048, 0, 1, 1.157
1184, 2085, 0, 0, 1.066
1184, 2085, 0, 1, 1.066
1184, 2048, 37, 0, 0.993
1184, 2048, 37, 1, 0.993
1184, 2085, 37, 0, 1.04
1184, 2085, 37, 1, 1.04
1216, 0, 0, 0, 1.139
1216, 0, 0, 1, 1.139
1216, 38, 0, 0, 1.024
1216, 38, 0, 1, 1.024
1216, 0, 38, 0, 1.086
1216, 0, 38, 1, 1.087
1216, 38, 38, 0, 1.0
1216, 38, 38, 1, 1.0
1216, 2048, 0, 0, 1.138
1216, 2048, 0, 1, 1.138
1216, 2086, 0, 0, 1.024
1216, 2086, 0, 1, 1.024
1216, 2048, 38, 0, 1.01
1216, 2048, 38, 1, 1.01
1216, 2086, 38, 0, 1.0
1216, 2086, 38, 1, 1.0
1248, 0, 0, 0, 1.175
1248, 0, 0, 1, 1.174
1248, 39, 0, 0, 1.074
1248, 39, 0, 1, 1.074
1248, 0, 39, 0, 0.975
1248, 0, 39, 1, 0.985
1248, 39, 39, 0, 1.064
1248, 39, 39, 1, 1.064
1248, 2048, 0, 0, 1.179
1248, 2048, 0, 1, 1.178
1248, 2087, 0, 0, 1.074
1248, 2087, 0, 1, 1.074
1248, 2048, 39, 0, 0.985
1248, 2048, 39, 1, 0.985
1248, 2087, 39, 0, 1.026
1248, 2087, 39, 1, 1.026
1280, 0, 0, 0, 0.992
1280, 0, 0, 1, 0.992
1280, 40, 0, 0, 1.051
1280, 40, 0, 1, 1.051
1280, 0, 40, 0, 1.044
1280, 0, 40, 1, 1.044
1280, 40, 40, 0, 1.252
1280, 40, 40, 1, 1.252
1280, 2048, 0, 0, 0.992
1280, 2048, 0, 1, 0.992
1280, 2088, 0, 0, 1.051
1280, 2088, 0, 1, 1.051
1280, 2048, 40, 0, 0.946
1280, 2048, 40, 1, 0.946
1280, 2088, 40, 0, 1.252
1280, 2088, 40, 1, 1.252
1312, 0, 0, 0, 0.969
1312, 0, 0, 1, 0.969
1312, 41, 0, 0, 0.988
1312, 41, 0, 1, 0.988
1312, 0, 41, 0, 0.837
1312, 0, 41, 1, 0.837
1312, 41, 41, 0, 1.025
1312, 41, 41, 1, 1.025
1312, 2048, 0, 0, 0.969
1312, 2048, 0, 1, 0.969
1312, 2089, 0, 0, 0.988
1312, 2089, 0, 1, 0.987
1312, 2048, 41, 0, 0.837
1312, 2048, 41, 1, 0.837
1312, 2089, 41, 0, 0.975
1312, 2089, 41, 1, 0.975
1344, 0, 0, 0, 0.987
1344, 0, 0, 1, 0.988
1344, 42, 0, 0, 1.031
1344, 42, 0, 1, 1.031
1344, 0, 42, 0, 1.033
1344, 0, 42, 1, 1.033
1344, 42, 42, 0, 0.982
1344, 42, 42, 1, 0.982
1344, 2048, 0, 0, 0.992
1344, 2048, 0, 1, 0.992
1344, 2090, 0, 0, 1.031
1344, 2090, 0, 1, 1.031
1344, 2048, 42, 0, 0.943
1344, 2048, 42, 1, 0.943
1344, 2090, 42, 0, 0.982
1344, 2090, 42, 1, 0.982
1376, 0, 0, 0, 1.016
1376, 0, 0, 1, 1.016
1376, 43, 0, 0, 1.005
1376, 43, 0, 1, 1.005
1376, 0, 43, 0, 0.829
1376, 0, 43, 1, 0.829
1376, 43, 43, 0, 1.024
1376, 43, 43, 1, 1.024
1376, 2048, 0, 0, 1.005
1376, 2048, 0, 1, 1.013
1376, 2091, 0, 0, 1.005
1376, 2091, 0, 1, 1.005
1376, 2048, 43, 0, 0.829
1376, 2048, 43, 1, 0.829
1376, 2091, 43, 0, 0.98
1376, 2091, 43, 1, 0.98
1408, 0, 0, 0, 0.988
1408, 0, 0, 1, 0.988
1408, 44, 0, 0, 1.015
1408, 44, 0, 1, 1.015
1408, 0, 44, 0, 1.023
1408, 0, 44, 1, 1.03
1408, 44, 44, 0, 0.998
1408, 44, 44, 1, 0.994
1408, 2048, 0, 0, 0.988
1408, 2048, 0, 1, 0.988
1408, 2092, 0, 0, 1.015
1408, 2092, 0, 1, 1.015
1408, 2048, 44, 0, 0.955
1408, 2048, 44, 1, 0.955
1408, 2092, 44, 0, 0.999
1408, 2092, 44, 1, 0.994
1440, 0, 0, 0, 0.986
1440, 0, 0, 1, 0.986
1440, 45, 0, 0, 1.008
1440, 45, 0, 1, 1.008
1440, 0, 45, 0, 0.814
1440, 0, 45, 1, 0.814
1440, 45, 45, 0, 1.006
1440, 45, 45, 1, 1.006
1440, 2048, 0, 0, 0.986
1440, 2048, 0, 1, 0.986
1440, 2093, 0, 0, 1.008
1440, 2093, 0, 1, 1.008
1440, 2048, 45, 0, 0.814
1440, 2048, 45, 1, 0.814
1440, 2093, 45, 0, 0.966
1440, 2093, 45, 1, 0.966
1472, 0, 0, 0, 0.993
1472, 0, 0, 1, 0.992
1472, 46, 0, 0, 1.045
1472, 46, 0, 1, 1.045
1472, 0, 46, 0, 1.026
1472, 0, 46, 1, 1.026
1472, 46, 46, 0, 0.966
1472, 46, 46, 1, 0.966
1472, 2048, 0, 0, 0.999
1472, 2048, 0, 1, 0.997
1472, 2094, 0, 0, 1.045
1472, 2094, 0, 1, 1.045
1472, 2048, 46, 0, 0.939
1472, 2048, 46, 1, 0.939
1472, 2094, 46, 0, 0.966
1472, 2094, 46, 1, 0.966
1504, 0, 0, 0, 0.991
1504, 0, 0, 1, 0.991
1504, 47, 0, 0, 0.999
1504, 47, 0, 1, 0.999
1504, 0, 47, 0, 0.826
1504, 0, 47, 1, 0.826
1504, 47, 47, 0, 1.023
1504, 47, 47, 1, 1.023
1504, 2048, 0, 0, 0.993
1504, 2048, 0, 1, 0.993
1504, 2095, 0, 0, 0.999
1504, 2095, 0, 1, 0.999
1504, 2048, 47, 0, 0.826
1504, 2048, 47, 1, 0.826
1504, 2095, 47, 0, 0.993
1504, 2095, 47, 1, 0.993
1536, 0, 0, 0, 0.994
1536, 0, 0, 1, 0.993
1536, 48, 0, 0, 1.019
1536, 48, 0, 1, 1.019
1536, 0, 48, 0, 1.025
1536, 0, 48, 1, 1.025
1536, 48, 48, 0, 0.993
1536, 48, 48, 1, 0.993
1536, 2048, 0, 0, 0.994
1536, 2048, 0, 1, 0.994
1536, 2096, 0, 0, 1.019
1536, 2096, 0, 1, 1.019
1536, 2048, 48, 0, 1.025
1536, 2048, 48, 1, 1.025
1536, 2096, 48, 0, 0.994
1536, 2096, 48, 1, 0.994
1568, 0, 0, 0, 0.994
1568, 0, 0, 1, 0.994
1568, 49, 0, 0, 0.903
1568, 49, 0, 1, 0.903
1568, 0, 49, 0, 1.147
1568, 0, 49, 1, 1.147
1568, 49, 49, 0, 1.461
1568, 49, 49, 1, 1.46
1568, 2048, 0, 0, 0.994
1568, 2048, 0, 1, 0.993
1568, 2097, 0, 0, 0.903
1568, 2097, 0, 1, 0.903
1568, 2048, 49, 0, 1.09
1568, 2048, 49, 1, 1.09
1568, 2097, 49, 0, 1.46
1568, 2097, 49, 1, 1.46
1600, 0, 0, 0, 0.981
1600, 0, 0, 1, 0.981
1600, 50, 0, 0, 1.022
1600, 50, 0, 1, 1.022
1600, 0, 50, 0, 1.017
1600, 0, 50, 1, 1.017
1600, 50, 50, 0, 0.973
1600, 50, 50, 1, 0.973
1600, 2048, 0, 0, 0.981
1600, 2048, 0, 1, 0.981
1600, 2098, 0, 0, 1.022
1600, 2098, 0, 1, 1.022
1600, 2048, 50, 0, 0.961
1600, 2048, 50, 1, 0.961
1600, 2098, 50, 0, 0.973
1600, 2098, 50, 1, 0.973
1632, 0, 0, 0, 1.018
1632, 0, 0, 1, 1.018
1632, 51, 0, 0, 0.893
1632, 51, 0, 1, 0.893
1632, 0, 51, 0, 1.134
1632, 0, 51, 1, 1.134
1632, 51, 51, 0, 1.444
1632, 51, 51, 1, 1.444
1632, 2048, 0, 0, 1.019
1632, 2048, 0, 1, 1.019
1632, 2099, 0, 0, 0.893
1632, 2099, 0, 1, 0.893
1632, 2048, 51, 0, 1.079
1632, 2048, 51, 1, 1.079
1632, 2099, 51, 0, 1.449
1632, 2099, 51, 1, 1.449
1664, 0, 0, 0, 1.006
1664, 0, 0, 1, 1.006
1664, 52, 0, 0, 0.982
1664, 52, 0, 1, 0.986
1664, 0, 52, 0, 1.004
1664, 0, 52, 1, 1.004
1664, 52, 52, 0, 0.976
1664, 52, 52, 1, 0.976
1664, 2048, 0, 0, 1.006
1664, 2048, 0, 1, 1.006
1664, 2100, 0, 0, 0.983
1664, 2100, 0, 1, 0.983
1664, 2048, 52, 0, 0.946
1664, 2048, 52, 1, 0.946
1664, 2100, 52, 0, 0.976
1664, 2100, 52, 1, 0.976
1696, 0, 0, 0, 0.99
1696, 0, 0, 1, 0.99
1696, 53, 0, 0, 0.884
1696, 53, 0, 1, 0.884
1696, 0, 53, 0, 1.141
1696, 0, 53, 1, 1.141
1696, 53, 53, 0, 1.43
1696, 53, 53, 1, 1.428
1696, 2048, 0, 0, 0.994
1696, 2048, 0, 1, 0.993
1696, 2101, 0, 0, 0.884
1696, 2101, 0, 1, 0.884
1696, 2048, 53, 0, 1.088
1696, 2048, 53, 1, 1.088
1696, 2101, 53, 0, 1.429
1696, 2101, 53, 1, 1.429
1728, 0, 0, 0, 0.978
1728, 0, 0, 1, 0.977
1728, 54, 0, 0, 1.032
1728, 54, 0, 1, 1.033
1728, 0, 54, 0, 1.0
1728, 0, 54, 1, 1.0
1728, 54, 54, 0, 0.96
1728, 54, 54, 1, 0.96
1728, 2048, 0, 0, 0.976
1728, 2048, 0, 1, 0.976
1728, 2102, 0, 0, 1.033
1728, 2102, 0, 1, 1.033
1728, 2048, 54, 0, 0.947
1728, 2048, 54, 1, 0.947
1728, 2102, 54, 0, 0.96
1728, 2102, 54, 1, 0.96
1760, 0, 0, 0, 1.019
1760, 0, 0, 1, 1.022
1760, 55, 0, 0, 0.9
1760, 55, 0, 1, 0.9
1760, 0, 55, 0, 1.125
1760, 0, 55, 1, 1.125
1760, 55, 55, 0, 1.438
1760, 55, 55, 1, 1.439
1760, 2048, 0, 0, 1.015
1760, 2048, 0, 1, 1.015
1760, 2103, 0, 0, 0.9
1760, 2103, 0, 1, 0.9
1760, 2048, 55, 0, 1.073
1760, 2048, 55, 1, 1.074
1760, 2103, 55, 0, 1.435
1760, 2103, 55, 1, 1.44
1792, 0, 0, 0, 1.003
1792, 0, 0, 1, 1.002
1792, 56, 0, 0, 1.028
1792, 56, 0, 1, 1.028
1792, 0, 56, 0, 1.014
1792, 0, 56, 1, 1.015
1792, 56, 56, 0, 1.191
1792, 56, 56, 1, 1.191
1792, 2048, 0, 0, 1.003
1792, 2048, 0, 1, 1.003
1792, 2104, 0, 0, 1.028
1792, 2104, 0, 1, 1.028
1792, 2048, 56, 0, 0.963
1792, 2048, 56, 1, 0.963
1792, 2104, 56, 0, 1.191
1792, 2104, 56, 1, 1.191
1824, 0, 0, 0, 1.001
1824, 0, 0, 1, 1.001
1824, 57, 0, 0, 0.891
1824, 57, 0, 1, 0.891
1824, 0, 57, 0, 1.114
1824, 0, 57, 1, 1.114
1824, 57, 57, 0, 1.407
1824, 57, 57, 1, 1.407
1824, 2048, 0, 0, 1.001
1824, 2048, 0, 1, 1.001
1824, 2105, 0, 0, 0.891
1824, 2105, 0, 1, 0.891
1824, 2048, 57, 0, 1.064
1824, 2048, 57, 1, 1.064
1824, 2105, 57, 0, 1.407
1824, 2105, 57, 1, 1.407
1856, 0, 0, 0, 0.991
1856, 0, 0, 1, 0.991
1856, 58, 0, 0, 1.042
1856, 58, 0, 1, 1.042
1856, 0, 58, 0, 1.007
1856, 0, 58, 1, 1.007
1856, 58, 58, 0, 0.98
1856, 58, 58, 1, 0.972
1856, 2048, 0, 0, 0.992
1856, 2048, 0, 1, 0.992
1856, 2106, 0, 0, 1.042
1856, 2106, 0, 1, 1.042
1856, 2048, 58, 0, 0.954
1856, 2048, 58, 1, 0.954
1856, 2106, 58, 0, 0.98
1856, 2106, 58, 1, 0.972
1888, 0, 0, 0, 0.993
1888, 0, 0, 1, 0.992
1888, 59, 0, 0, 0.883
1888, 59, 0, 1, 0.883
1888, 0, 59, 0, 1.124
1888, 0, 59, 1, 1.125
1888, 59, 59, 0, 1.413
1888, 59, 59, 1, 1.413
1888, 2048, 0, 0, 0.986
1888, 2048, 0, 1, 0.991
1888, 2107, 0, 0, 0.883
1888, 2107, 0, 1, 0.883
1888, 2048, 59, 0, 1.076
1888, 2048, 59, 1, 1.076
1888, 2107, 59, 0, 1.413
1888, 2107, 59, 1, 1.413
1920, 0, 0, 0, 1.0
1920, 0, 0, 1, 1.0
1920, 60, 0, 0, 1.033
1920, 60, 0, 1, 1.034
1920, 0, 60, 0, 0.996
1920, 0, 60, 1, 0.997
1920, 60, 60, 0, 0.968
1920, 60, 60, 1, 0.968
1920, 2048, 0, 0, 1.0
1920, 2048, 0, 1, 1.0
1920, 2108, 0, 0, 1.034
1920, 2108, 0, 1, 1.034
1920, 2048, 60, 0, 0.949
1920, 2048, 60, 1, 0.949
1920, 2108, 60, 0, 0.968
1920, 2108, 60, 1, 0.968
1952, 0, 0, 0, 1.004
1952, 0, 0, 1, 1.004
1952, 61, 0, 0, 0.897
1952, 61, 0, 1, 0.898
1952, 0, 61, 0, 1.118
1952, 0, 61, 1, 1.118
1952, 61, 61, 0, 1.387
1952, 61, 61, 1, 1.387
1952, 2048, 0, 0, 1.004
1952, 2048, 0, 1, 1.004
1952, 2109, 0, 0, 0.898
1952, 2109, 0, 1, 0.898
1952, 2048, 61, 0, 1.071
1952, 2048, 61, 1, 1.071
1952, 2109, 61, 0, 1.387
1952, 2109, 61, 1, 1.387
1984, 0, 0, 0, 0.993
1984, 0, 0, 1, 0.993
1984, 62, 0, 0, 1.025
1984, 62, 0, 1, 1.025
1984, 0, 62, 0, 1.005
1984, 0, 62, 1, 1.007
1984, 62, 62, 0, 0.982
1984, 62, 62, 1, 0.982
1984, 2048, 0, 0, 0.993
1984, 2048, 0, 1, 0.993
1984, 2110, 0, 0, 1.025
1984, 2110, 0, 1, 1.025
1984, 2048, 62, 0, 0.96
1984, 2048, 62, 1, 0.96
1984, 2110, 62, 0, 0.982
1984, 2110, 62, 1, 0.982
2016, 0, 0, 0, 0.999
2016, 0, 0, 1, 0.999
2016, 63, 0, 0, 0.889
2016, 63, 0, 1, 0.89
2016, 0, 63, 0, 1.093
2016, 0, 63, 1, 1.094
2016, 63, 63, 0, 1.362
2016, 63, 63, 1, 1.363
2016, 2048, 0, 0, 1.0
2016, 2048, 0, 1, 1.0
2016, 2111, 0, 0, 0.965
2016, 2111, 0, 1, 0.965
2016, 2048, 63, 0, 1.049
2016, 2048, 63, 1, 1.049
2016, 2111, 63, 0, 1.405
2016, 2111, 63, 1, 1.405
2048, 32, 0, 0, 1.01
2048, 32, 0, 1, 1.01
2048, 0, 32, 0, 1.005
2048, 0, 32, 1, 1.005
2048, 32, 32, 0, 1.005
2048, 32, 32, 1, 1.005
2048, 0, 1, 0, 0.983
2048, 0, 1, 1, 0.984
2048, 1, 0, 0, 1.039
2048, 1, 0, 1, 1.039
2048, 32, 1, 0, 1.063
2048, 32, 1, 1, 1.063
2048, 1, 32, 0, 0.94
2048, 1, 32, 1, 0.94
2048, 2048, 1, 0, 0.981
2048, 2048, 1, 1, 0.981
2048, 2049, 0, 0, 0.904
2048, 2049, 0, 1, 0.904
2112, 0, 0, 0, 0.996
2112, 0, 0, 1, 0.996
2112, 1, 0, 0, 1.031
2112, 1, 0, 1, 1.031
2112, 33, 0, 0, 1.01
2112, 33, 0, 1, 1.01
2112, 0, 1, 0, 0.972
2112, 0, 1, 1, 0.972
2112, 0, 33, 0, 0.988
2112, 0, 33, 1, 0.988
2112, 1, 1, 0, 0.914
2112, 1, 1, 1, 0.914
2112, 33, 33, 0, 0.983
2112, 33, 33, 1, 0.983
2112, 2048, 0, 0, 0.993
2112, 2048, 0, 1, 0.991
2112, 2049, 0, 0, 1.031
2112, 2049, 0, 1, 1.031
2112, 2048, 1, 0, 0.955
2112, 2048, 1, 1, 0.955
2112, 2049, 1, 0, 0.906
2112, 2049, 1, 1, 0.906
2112, 33, 1, 0, 1.163
2112, 33, 1, 1, 1.164
2112, 1, 33, 0, 1.046
2112, 1, 33, 1, 1.046
2176, 0, 0, 0, 0.985
2176, 0, 0, 1, 0.985
2176, 2, 0, 0, 1.023
2176, 2, 0, 1, 1.023
2176, 34, 0, 0, 1.0
2176, 34, 0, 1, 1.0
2176, 0, 2, 0, 0.984
2176, 0, 2, 1, 0.985
2176, 0, 34, 0, 0.986
2176, 0, 34, 1, 0.993
2176, 2, 2, 0, 0.928
2176, 2, 2, 1, 0.928
2176, 34, 34, 0, 1.004
2176, 34, 34, 1, 1.004
2176, 2048, 0, 0, 0.985
2176, 2048, 0, 1, 0.985
2176, 2050, 0, 0, 1.023
2176, 2050, 0, 1, 1.023
2176, 2048, 2, 0, 0.802
2176, 2048, 2, 1, 0.802
2176, 2050, 2, 0, 0.894
2176, 2050, 2, 1, 0.894
2176, 2, 1, 0, 1.068
2176, 2, 1, 1, 1.068
2176, 1, 2, 0, 0.976
2176, 1, 2, 1, 0.976
2176, 34, 1, 0, 1.077
2176, 34, 1, 1, 1.077
2176, 1, 34, 0, 0.978
2176, 1, 34, 1, 0.978
2176, 2050, 1, 0, 1.061
2176, 2050, 1, 1, 1.061
2176, 2049, 2, 0, 0.971
2176, 2049, 2, 1, 0.971
2240, 0, 0, 0, 0.994
2240, 0, 0, 1, 0.994
2240, 3, 0, 0, 1.038
2240, 3, 0, 1, 1.039
2240, 35, 0, 0, 1.019
2240, 35, 0, 1, 1.019
2240, 0, 3, 0, 0.979
2240, 0, 3, 1, 0.98
2240, 0, 35, 0, 0.991
2240, 0, 35, 1, 0.991
2240, 3, 3, 0, 0.931
2240, 3, 3, 1, 0.931
2240, 35, 35, 0, 0.999
2240, 35, 35, 1, 0.999
2240, 2048, 0, 0, 0.995
2240, 2048, 0, 1, 0.995
2240, 2051, 0, 0, 1.039
2240, 2051, 0, 1, 1.039
2240, 2048, 3, 0, 0.799
2240, 2048, 3, 1, 0.799
2240, 2051, 3, 0, 0.889
2240, 2051, 3, 1, 0.889
2240, 3, 1, 0, 1.06
2240, 3, 1, 1, 1.06
2240, 1, 3, 0, 0.968
2240, 1, 3, 1, 0.968
2240, 35, 1, 0, 1.071
2240, 35, 1, 1, 1.071
2240, 1, 35, 0, 0.971
2240, 1, 35, 1, 0.971
2240, 2051, 1, 0, 1.057
2240, 2051, 1, 1, 1.057
2240, 2049, 3, 0, 0.966
2240, 2049, 3, 1, 0.966
2304, 0, 0, 0, 0.988
2304, 0, 0, 1, 0.988
2304, 4, 0, 0, 1.031
2304, 4, 0, 1, 1.032
2304, 36, 0, 0, 1.011
2304, 36, 0, 1, 1.011
2304, 0, 4, 0, 0.968
2304, 0, 4, 1, 0.967
2304, 0, 36, 0, 0.988
2304, 0, 36, 1, 0.988
2304, 4, 4, 0, 0.931
2304, 4, 4, 1, 0.931
2304, 36, 36, 0, 0.992
2304, 36, 36, 1, 0.992
2304, 2048, 0, 0, 0.988
2304, 2048, 0, 1, 0.988
2304, 2052, 0, 0, 1.032
2304, 2052, 0, 1, 1.032
2304, 2048, 4, 0, 0.793
2304, 2048, 4, 1, 0.793
2304, 2052, 4, 0, 0.884
2304, 2052, 4, 1, 0.884
2304, 4, 1, 0, 0.989
2304, 4, 1, 1, 0.989
2304, 1, 4, 0, 0.897
2304, 1, 4, 1, 0.898
2304, 36, 1, 0, 1.057
2304, 36, 1, 1, 1.057
2304, 1, 36, 0, 0.966
2304, 1, 36, 1, 0.966
2304, 2052, 1, 0, 1.052
2304, 2052, 1, 1, 1.052
2304, 2049, 4, 0, 0.955
2304, 2049, 4, 1, 0.955
2368, 0, 0, 0, 0.999
2368, 0, 0, 1, 1.0
2368, 5, 0, 0, 1.024
2368, 5, 0, 1, 1.025
2368, 37, 0, 0, 1.0
2368, 37, 0, 1, 1.0
2368, 0, 5, 0, 0.98
2368, 0, 5, 1, 0.981
2368, 0, 37, 0, 0.986
2368, 0, 37, 1, 0.981
2368, 5, 5, 0, 0.944
2368, 5, 5, 1, 0.944
2368, 37, 37, 0, 1.003
2368, 37, 37, 1, 1.003
2368, 2048, 0, 0, 1.002
2368, 2048, 0, 1, 1.002
2368, 2053, 0, 0, 1.025
2368, 2053, 0, 1, 1.025
2368, 2048, 5, 0, 0.801
2368, 2048, 5, 1, 0.801
2368, 2053, 5, 0, 0.907
2368, 2053, 5, 1, 0.907
2368, 5, 1, 0, 1.071
2368, 5, 1, 1, 1.071
2368, 1, 5, 0, 0.973
2368, 1, 5, 1, 0.973
2368, 37, 1, 0, 1.07
2368, 37, 1, 1, 1.07
2368, 1, 37, 0, 0.974
2368, 1, 37, 1, 0.974
2368, 2053, 1, 0, 1.065
2368, 2053, 1, 1, 1.065
2368, 2049, 5, 0, 0.967
2368, 2049, 5, 1, 0.967
2432, 0, 0, 0, 0.968
2432, 0, 0, 1, 1.002
2432, 6, 0, 0, 1.032
2432, 6, 0, 1, 1.033
2432, 38, 0, 0, 1.021
2432, 38, 0, 1, 1.021
2432, 0, 6, 0, 0.973
2432, 0, 6, 1, 0.976
2432, 0, 38, 0, 0.986
2432, 0, 38, 1, 0.986
2432, 6, 6, 0, 0.926
2432, 6, 6, 1, 0.926
2432, 38, 38, 0, 1.0
2432, 38, 38, 1, 1.0
2432, 2048, 0, 0, 1.005
2432, 2048, 0, 1, 1.004
2432, 2054, 0, 0, 1.032
2432, 2054, 0, 1, 1.033
2432, 2048, 6, 0, 0.797
2432, 2048, 6, 1, 0.797
2432, 2054, 6, 0, 0.898
2432, 2054, 6, 1, 0.898
2432, 6, 1, 0, 1.058
2432, 6, 1, 1, 1.058
2432, 1, 6, 0, 0.96
2432, 1, 6, 1, 0.96
2432, 38, 1, 0, 1.062
2432, 38, 1, 1, 1.062
2432, 1, 38, 0, 0.963
2432, 1, 38, 1, 0.963
2432, 2054, 1, 0, 1.054
2432, 2054, 1, 1, 1.054
2432, 2049, 6, 0, 0.957
2432, 2049, 6, 1, 0.957
2496, 0, 0, 0, 1.013
2496, 0, 0, 1, 1.013
2496, 7, 0, 0, 1.025
2496, 7, 0, 1, 1.026
2496, 39, 0, 0, 1.013
2496, 39, 0, 1, 1.013
2496, 0, 7, 0, 0.964
2496, 0, 7, 1, 0.966
2496, 0, 39, 0, 0.979
2496, 0, 39, 1, 0.979
2496, 7, 7, 0, 0.925
2496, 7, 7, 1, 0.925
2496, 39, 39, 0, 0.989
2496, 39, 39, 1, 0.989
2496, 2048, 0, 0, 1.013
2496, 2048, 0, 1, 1.013
2496, 2055, 0, 0, 1.026
2496, 2055, 0, 1, 1.026
2496, 2048, 7, 0, 0.792
2496, 2048, 7, 1, 0.792
2496, 2055, 7, 0, 0.93
2496, 2055, 7, 1, 0.93
2496, 7, 1, 0, 0.982
2496, 7, 1, 1, 0.982
2496, 1, 7, 0, 0.893
2496, 1, 7, 1, 0.893
2496, 39, 1, 0, 1.048
2496, 39, 1, 1, 1.049
2496, 1, 39, 0, 0.958
2496, 1, 39, 1, 0.958
2496, 2055, 1, 0, 1.042
2496, 2055, 1, 1, 1.042
2496, 2049, 7, 0, 0.947
2496, 2049, 7, 1, 0.947
2560, 0, 0, 0, 0.993
2560, 0, 0, 1, 0.993
2560, 8, 0, 0, 1.031
2560, 8, 0, 1, 1.032
2560, 40, 0, 0, 1.029
2560, 40, 0, 1, 1.029
2560, 0, 8, 0, 0.992
2560, 0, 8, 1, 0.992
2560, 0, 40, 0, 0.981
2560, 0, 40, 1, 0.98
2560, 8, 8, 0, 0.943
2560, 8, 8, 1, 0.942
2560, 40, 40, 0, 1.141
2560, 40, 40, 1, 1.141
2560, 2048, 0, 0, 0.993
2560, 2048, 0, 1, 0.993
2560, 2056, 0, 0, 1.032
2560, 2056, 0, 1, 1.032
2560, 2048, 8, 0, 0.812
2560, 2048, 8, 1, 0.812
2560, 2056, 8, 0, 0.912
2560, 2056, 8, 1, 0.912
2560, 8, 1, 0, 1.069
2560, 8, 1, 1, 1.069
2560, 1, 8, 0, 0.974
2560, 1, 8, 1, 0.974
2560, 40, 1, 0, 1.068
2560, 40, 1, 1, 1.068
2560, 1, 40, 0, 0.996
2560, 1, 40, 1, 0.996
2560, 2056, 1, 0, 1.063
2560, 2056, 1, 1, 1.063
2560, 2049, 8, 0, 0.969
2560, 2049, 8, 1, 0.969
2624, 0, 0, 0, 0.997
2624, 0, 0, 1, 0.997
2624, 9, 0, 0, 1.008
2624, 9, 0, 1, 1.012
2624, 41, 0, 0, 1.044
2624, 41, 0, 1, 1.044
2624, 0, 9, 0, 0.988
2624, 0, 9, 1, 0.99
2624, 0, 41, 0, 0.99
2624, 0, 41, 1, 0.99
2624, 9, 9, 0, 0.943
2624, 9, 9, 1, 0.943
2624, 41, 41, 0, 0.993
2624, 41, 41, 1, 0.993
2624, 2048, 0, 0, 0.998
2624, 2048, 0, 1, 0.998
2624, 2057, 0, 0, 1.012
2624, 2057, 0, 1, 1.012
2624, 2048, 9, 0, 0.81
2624, 2048, 9, 1, 0.81
2624, 2057, 9, 0, 0.907
2624, 2057, 9, 1, 0.907
2624, 9, 1, 0, 1.085
2624, 9, 1, 1, 1.084
2624, 1, 9, 0, 0.962
2624, 1, 9, 1, 0.963
2624, 41, 1, 0, 1.078
2624, 41, 1, 1, 1.078
2624, 1, 41, 0, 0.962
2624, 1, 41, 1, 0.962
2624, 2057, 1, 0, 1.081
2624, 2057, 1, 1, 1.081
2624, 2049, 9, 0, 0.959
2624, 2049, 9, 1, 0.959
2688, 0, 0, 0, 0.995
2688, 0, 0, 1, 0.995
2688, 10, 0, 0, 1.003
2688, 10, 0, 1, 1.006
2688, 42, 0, 0, 1.036
2688, 42, 0, 1, 1.036
2688, 0, 10, 0, 0.978
2688, 0, 10, 1, 0.979
2688, 0, 42, 0, 0.978
2688, 0, 42, 1, 0.977
2688, 10, 10, 0, 0.942
2688, 10, 10, 1, 0.942
2688, 42, 42, 0, 0.989
2688, 42, 42, 1, 0.989
2688, 2048, 0, 0, 0.995
2688, 2048, 0, 1, 0.995
2688, 2058, 0, 0, 1.006
2688, 2058, 0, 1, 1.006
2688, 2048, 10, 0, 0.804
2688, 2048, 10, 1, 0.804
2688, 2058, 10, 0, 0.905
2688, 2058, 10, 1, 0.905
2688, 10, 1, 0, 0.985
2688, 10, 1, 1, 0.985
2688, 1, 10, 0, 0.892
2688, 1, 10, 1, 0.892
2688, 42, 1, 0, 1.048
2688, 42, 1, 1, 1.048
2688, 1, 42, 0, 0.958
2688, 1, 42, 1, 0.958
2688, 2058, 1, 0, 1.046
2688, 2058, 1, 1, 1.046
2688, 2049, 10, 0, 0.948
2688, 2049, 10, 1, 0.948
2752, 0, 0, 0, 0.998
2752, 0, 0, 1, 0.993
2752, 11, 0, 0, 0.96
2752, 11, 0, 1, 0.96
2752, 43, 0, 0, 0.979
2752, 43, 0, 1, 0.979
2752, 0, 11, 0, 0.939
2752, 0, 11, 1, 0.939
2752, 0, 43, 0, 0.93
2752, 0, 43, 1, 0.93
2752, 11, 11, 0, 0.949
2752, 11, 11, 1, 0.949
2752, 43, 43, 0, 1.007
2752, 43, 43, 1, 1.007
2752, 2048, 0, 0, 0.993
2752, 2048, 0, 1, 0.994
2752, 2059, 0, 0, 0.96
2752, 2059, 0, 1, 0.96
2752, 2048, 11, 0, 0.77
2752, 2048, 11, 1, 0.77
2752, 2059, 11, 0, 0.916
2752, 2059, 11, 1, 0.916
2752, 11, 1, 0, 1.0
2752, 11, 1, 1, 1.0
2752, 1, 11, 0, 0.933
2752, 1, 11, 1, 0.933
2752, 43, 1, 0, 1.028
2752, 43, 1, 1, 1.028
2752, 1, 43, 0, 0.925
2752, 1, 43, 1, 0.925
2752, 2059, 1, 0, 0.995
2752, 2059, 1, 1, 0.995
2752, 2049, 11, 0, 0.929
2752, 2049, 11, 1, 0.929
2816, 0, 0, 0, 1.004
2816, 0, 0, 1, 1.004
2816, 12, 0, 0, 0.897
2816, 12, 0, 1, 0.894
2816, 44, 0, 0, 0.914
2816, 44, 0, 1, 0.914
2816, 0, 12, 0, 0.877
2816, 0, 12, 1, 0.874
2816, 0, 44, 0, 0.871
2816, 0, 44, 1, 0.87
2816, 12, 12, 0, 0.948
2816, 12, 12, 1, 0.948
2816, 44, 44, 0, 1.009
2816, 44, 44, 1, 1.009
2816, 2048, 0, 0, 1.005
2816, 2048, 0, 1, 1.005
2816, 2060, 0, 0, 0.894
2816, 2060, 0, 1, 0.894
2816, 2048, 12, 0, 0.715
2816, 2048, 12, 1, 0.713
2816, 2060, 12, 0, 0.915
2816, 2060, 12, 1, 0.915
2816, 12, 1, 0, 0.918
2816, 12, 1, 1, 0.917
2816, 1, 12, 0, 0.863
2816, 1, 12, 1, 0.863
2816, 44, 1, 0, 0.944
2816, 44, 1, 1, 0.943
2816, 1, 44, 0, 0.861
2816, 1, 44, 1, 0.861
2816, 2060, 1, 0, 0.919
2816, 2060, 1, 1, 0.924
2816, 2049, 12, 0, 0.86
2816, 2049, 12, 1, 0.86
2880, 0, 0, 0, 0.989
2880, 0, 0, 1, 0.989
2880, 13, 0, 0, 0.967
2880, 13, 0, 1, 0.967
2880, 45, 0, 0, 0.987
2880, 45, 0, 1, 0.987
2880, 0, 13, 0, 0.925
2880, 0, 13, 1, 0.925
2880, 0, 45, 0, 0.927
2880, 0, 45, 1, 0.927
2880, 13, 13, 0, 0.944
2880, 13, 13, 1, 0.944
2880, 45, 45, 0, 1.003
2880, 45, 45, 1, 1.003
2880, 2048, 0, 0, 0.989
2880, 2048, 0, 1, 0.989
2880, 2061, 0, 0, 0.967
2880, 2061, 0, 1, 0.967
2880, 2048, 13, 0, 0.76
2880, 2048, 13, 1, 0.76
2880, 2061, 13, 0, 0.91
2880, 2061, 13, 1, 0.91
2880, 13, 1, 0, 0.922
2880, 13, 1, 1, 0.922
2880, 1, 13, 0, 0.859
2880, 1, 13, 1, 0.859
2880, 45, 1, 0, 1.013
2880, 45, 1, 1, 1.013
2880, 1, 45, 0, 0.92
2880, 1, 45, 1, 0.92
2880, 2061, 1, 0, 0.984
2880, 2061, 1, 1, 0.984
2880, 2049, 13, 0, 0.918
2880, 2049, 13, 1, 0.918
2944, 0, 0, 0, 1.014
2944, 0, 0, 1, 1.015
2944, 14, 0, 0, 0.961
2944, 14, 0, 1, 0.961
2944, 46, 0, 0, 0.979
2944, 46, 0, 1, 0.979
2944, 0, 14, 0, 0.934
2944, 0, 14, 1, 0.937
2944, 0, 46, 0, 0.924
2944, 0, 46, 1, 0.921
2944, 14, 14, 0, 0.953
2944, 14, 14, 1, 0.953
2944, 46, 46, 0, 1.009
2944, 46, 46, 1, 1.009
2944, 2048, 0, 0, 1.015
2944, 2048, 0, 1, 1.015
2944, 2062, 0, 0, 0.961
2944, 2062, 0, 1, 0.961
2944, 2048, 14, 0, 0.769
2944, 2048, 14, 1, 0.769
2944, 2062, 14, 0, 0.923
2944, 2062, 14, 1, 0.923
2944, 14, 1, 0, 0.999
2944, 14, 1, 1, 0.999
2944, 1, 14, 0, 0.927
2944, 1, 14, 1, 0.927
2944, 46, 1, 0, 1.027
2944, 46, 1, 1, 1.027
2944, 1, 46, 0, 0.918
2944, 1, 46, 1, 0.918
2944, 2062, 1, 0, 0.995
2944, 2062, 1, 1, 0.995
2944, 2049, 14, 0, 0.922
2944, 2049, 14, 1, 0.922
3008, 0, 0, 0, 0.998
3008, 0, 0, 1, 0.997
3008, 15, 0, 0, 0.953
3008, 15, 0, 1, 0.953
3008, 47, 0, 0, 0.996
3008, 47, 0, 1, 0.996
3008, 0, 15, 0, 0.933
3008, 0, 15, 1, 0.929
3008, 0, 47, 0, 0.933
3008, 0, 47, 1, 0.933
3008, 15, 15, 0, 0.95
3008, 15, 15, 1, 0.949
3008, 47, 47, 0, 1.003
3008, 47, 47, 1, 1.003
3008, 2048, 0, 0, 0.998
3008, 2048, 0, 1, 0.998
3008, 2063, 0, 0, 0.953
3008, 2063, 0, 1, 0.953
3008, 2048, 15, 0, 0.766
3008, 2048, 15, 1, 0.766
3008, 2063, 15, 0, 0.916
3008, 2063, 15, 1, 0.916
3008, 15, 1, 0, 0.996
3008, 15, 1, 1, 0.996
3008, 1, 15, 0, 0.927
3008, 1, 15, 1, 0.927
3008, 47, 1, 0, 1.026
3008, 47, 1, 1, 1.026
3008, 1, 47, 0, 0.918
3008, 1, 47, 1, 0.918
3008, 2063, 1, 0, 0.994
3008, 2063, 1, 1, 0.994
3008, 2049, 15, 0, 0.925
3008, 2049, 15, 1, 0.925
3072, 0, 0, 0, 1.015
3072, 0, 0, 1, 1.016
3072, 16, 0, 0, 1.045
3072, 16, 0, 1, 1.045
3072, 48, 0, 0, 1.045
3072, 48, 0, 1, 1.045
3072, 0, 16, 0, 1.049
3072, 0, 16, 1, 1.049
3072, 0, 48, 0, 1.049
3072, 0, 48, 1, 1.049
3072, 16, 16, 0, 1.016
3072, 16, 16, 1, 1.015
3072, 48, 48, 0, 1.015
3072, 48, 48, 1, 1.016
3072, 2048, 0, 0, 1.016
3072, 2048, 0, 1, 1.016
3072, 2064, 0, 0, 1.045
3072, 2064, 0, 1, 1.045
3072, 2048, 16, 0, 1.049
3072, 2048, 16, 1, 1.049
3072, 2064, 16, 0, 1.016
3072, 2064, 16, 1, 1.016
3072, 16, 1, 0, 0.815
3072, 16, 1, 1, 0.815
3072, 1, 16, 0, 0.872
3072, 1, 16, 1, 0.872
3072, 48, 1, 0, 1.017
3072, 48, 1, 1, 1.017
3072, 1, 48, 0, 0.872
3072, 1, 48, 1, 0.872
3072, 2064, 1, 0, 0.815
3072, 2064, 1, 1, 0.815
3072, 2049, 16, 0, 0.872
3072, 2049, 16, 1, 0.872
3136, 0, 0, 0, 0.995
3136, 0, 0, 1, 0.996
3136, 17, 0, 0, 0.949
3136, 17, 0, 1, 0.949
3136, 49, 0, 0, 0.987
3136, 49, 0, 1, 0.987
3136, 0, 17, 0, 0.922
3136, 0, 17, 1, 0.919
3136, 0, 49, 0, 0.931
3136, 0, 49, 1, 0.931
3136, 17, 17, 0, 1.122
3136, 17, 17, 1, 1.119
3136, 49, 49, 0, 0.987
3136, 49, 49, 1, 0.987
3136, 2048, 0, 0, 0.997
3136, 2048, 0, 1, 0.997
3136, 2065, 0, 0, 0.949
3136, 2065, 0, 1, 0.949
3136, 2048, 17, 0, 0.896
3136, 2048, 17, 1, 0.896
3136, 2065, 17, 0, 1.122
3136, 2065, 17, 1, 1.12
3136, 17, 1, 0, 1.185
3136, 17, 1, 1, 1.185
3136, 1, 17, 0, 1.124
3136, 1, 17, 1, 1.124
3136, 49, 1, 0, 1.11
3136, 49, 1, 1, 1.109
3136, 1, 49, 0, 1.044
3136, 1, 49, 1, 1.044
3136, 2065, 1, 0, 1.147
3136, 2065, 1, 1, 1.147
3136, 2049, 17, 0, 1.103
3136, 2049, 17, 1, 1.103
3200, 0, 0, 0, 1.006
3200, 0, 0, 1, 1.006
3200, 18, 0, 0, 0.978
3200, 18, 0, 1, 0.978
3200, 50, 0, 0, 0.998
3200, 50, 0, 1, 0.998
3200, 0, 18, 0, 0.932
3200, 0, 18, 1, 0.932
3200, 0, 50, 0, 0.93
3200, 0, 50, 1, 0.93
3200, 18, 18, 0, 1.11
3200, 18, 18, 1, 1.11
3200, 50, 50, 0, 0.994
3200, 50, 50, 1, 0.994
3200, 2048, 0, 0, 1.007
3200, 2048, 0, 1, 1.007
3200, 2066, 0, 0, 0.978
3200, 2066, 0, 1, 0.978
3200, 2048, 18, 0, 0.894
3200, 2048, 18, 1, 0.894
3200, 2066, 18, 0, 1.11
3200, 2066, 18, 1, 1.11
3200, 18, 1, 0, 1.002
3200, 18, 1, 1, 1.002
3200, 1, 18, 0, 0.917
3200, 1, 18, 1, 0.917
3200, 50, 1, 0, 0.963
3200, 50, 1, 1, 0.964
3200, 1, 50, 0, 0.888
3200, 1, 50, 1, 0.888
3200, 2066, 1, 0, 1.002
3200, 2066, 1, 1, 1.002
3200, 2049, 18, 0, 0.914
3200, 2049, 18, 1, 0.914
3264, 0, 0, 0, 0.994
3264, 0, 0, 1, 0.994
3264, 19, 0, 0, 0.959
3264, 19, 0, 1, 0.959
3264, 51, 0, 0, 0.994
3264, 51, 0, 1, 0.994
3264, 0, 19, 0, 0.927
3264, 0, 19, 1, 0.927
3264, 0, 51, 0, 0.927
3264, 0, 51, 1, 0.927
3264, 19, 19, 0, 1.1
3264, 19, 19, 1, 1.099
3264, 51, 51, 0, 0.982
3264, 51, 51, 1, 0.982
3264, 2048, 0, 0, 0.994
3264, 2048, 0, 1, 0.994
3264, 2067, 0, 0, 0.959
3264, 2067, 0, 1, 0.959
3264, 2048, 19, 0, 0.891
3264, 2048, 19, 1, 0.891
3264, 2067, 19, 0, 1.099
3264, 2067, 19, 1, 1.099
3264, 19, 1, 0, 0.977
3264, 19, 1, 1, 0.976
3264, 1, 19, 0, 0.921
3264, 1, 19, 1, 0.921
3264, 51, 1, 0, 0.959
3264, 51, 1, 1, 0.959
3264, 1, 51, 0, 0.886
3264, 1, 51, 1, 0.886
3264, 2067, 1, 0, 0.976
3264, 2067, 1, 1, 0.976
3264, 2049, 19, 0, 0.917
3264, 2049, 19, 1, 0.917
3328, 0, 0, 0, 0.997
3328, 0, 0, 1, 0.993
3328, 20, 0, 0, 0.955
3328, 20, 0, 1, 0.955
3328, 52, 0, 0, 0.99
3328, 52, 0, 1, 0.99
3328, 0, 20, 0, 0.925
3328, 0, 20, 1, 0.927
3328, 0, 52, 0, 0.933
3328, 0, 52, 1, 0.933
3328, 20, 20, 0, 1.11
3328, 20, 20, 1, 1.11
3328, 52, 52, 0, 0.988
3328, 52, 52, 1, 0.988
3328, 2048, 0, 0, 0.996
3328, 2048, 0, 1, 0.993
3328, 2068, 0, 0, 0.955
3328, 2068, 0, 1, 0.955
3328, 2048, 20, 0, 0.9
3328, 2048, 20, 1, 0.9
3328, 2068, 20, 0, 1.109
3328, 2068, 20, 1, 1.109
3328, 20, 1, 0, 0.996
3328, 20, 1, 1, 0.996
3328, 1, 20, 0, 0.927
3328, 1, 20, 1, 0.927
3328, 52, 1, 0, 0.972
3328, 52, 1, 1, 0.972
3328, 1, 52, 0, 0.901
3328, 1, 52, 1, 0.901
3328, 2068, 1, 0, 0.996
3328, 2068, 1, 1, 0.996
3328, 2049, 20, 0, 0.924
3328, 2049, 20, 1, 0.924
3392, 0, 0, 0, 0.996
3392, 0, 0, 1, 1.0
3392, 21, 0, 0, 0.964
3392, 21, 0, 1, 0.964
3392, 53, 0, 0, 0.999
3392, 53, 0, 1, 0.999
3392, 0, 21, 0, 0.932
3392, 0, 21, 1, 0.932
3392, 0, 53, 0, 0.93
3392, 0, 53, 1, 0.93
3392, 21, 21, 0, 1.113
3392, 21, 21, 1, 1.113
3392, 53, 53, 0, 0.983
3392, 53, 53, 1, 0.983
3392, 2048, 0, 0, 1.0
3392, 2048, 0, 1, 1.0
3392, 2069, 0, 0, 0.964
3392, 2069, 0, 1, 0.964
3392, 2048, 21, 0, 0.896
3392, 2048, 21, 1, 0.896
3392, 2069, 21, 0, 1.113
3392, 2069, 21, 1, 1.113
3392, 21, 1, 0, 0.994
3392, 21, 1, 1, 0.994
3392, 1, 21, 0, 0.918
3392, 1, 21, 1, 0.918
3392, 53, 1, 0, 0.972
3392, 53, 1, 1, 0.972
3392, 1, 53, 0, 0.891
3392, 1, 53, 1, 0.891
3392, 2069, 1, 0, 0.994
3392, 2069, 1, 1, 0.994
3392, 2049, 21, 0, 0.915
3392, 2049, 21, 1, 0.915
3456, 0, 0, 0, 0.995
3456, 0, 0, 1, 0.995
3456, 22, 0, 0, 0.965
3456, 22, 0, 1, 0.965
3456, 54, 0, 0, 0.996
3456, 54, 0, 1, 0.996
3456, 0, 22, 0, 0.927
3456, 0, 22, 1, 0.927
3456, 0, 54, 0, 0.927
3456, 0, 54, 1, 0.927
3456, 22, 22, 0, 1.106
3456, 22, 22, 1, 1.107
3456, 54, 54, 0, 0.98
3456, 54, 54, 1, 0.98
3456, 2048, 0, 0, 0.995
3456, 2048, 0, 1, 0.995
3456, 2070, 0, 0, 0.965
3456, 2070, 0, 1, 0.965
3456, 2048, 22, 0, 0.893
3456, 2048, 22, 1, 0.893
3456, 2070, 22, 0, 1.107
3456, 2070, 22, 1, 1.107
3456, 22, 1, 0, 0.988
3456, 22, 1, 1, 0.988
3456, 1, 22, 0, 0.915
3456, 1, 22, 1, 0.915
3456, 54, 1, 0, 0.963
3456, 54, 1, 1, 0.963
3456, 1, 54, 0, 0.887
3456, 1, 54, 1, 0.887
3456, 2070, 1, 0, 0.988
3456, 2070, 1, 1, 0.988
3456, 2049, 22, 0, 0.911
3456, 2049, 22, 1, 0.911
3520, 0, 0, 0, 1.016
3520, 0, 0, 1, 1.016
3520, 23, 0, 0, 0.957
3520, 23, 0, 1, 0.957
3520, 55, 0, 0, 0.991
3520, 55, 0, 1, 0.991
3520, 0, 23, 0, 0.918
3520, 0, 23, 1, 0.929
3520, 0, 55, 0, 0.935
3520, 0, 55, 1, 0.934
3520, 23, 23, 0, 1.111
3520, 23, 23, 1, 1.111
3520, 55, 55, 0, 0.994
3520, 55, 55, 1, 0.994
3520, 2048, 0, 0, 1.016
3520, 2048, 0, 1, 1.016
3520, 2071, 0, 0, 0.957
3520, 2071, 0, 1, 0.957
3520, 2048, 23, 0, 0.903
3520, 2048, 23, 1, 0.902
3520, 2071, 23, 0, 1.111
3520, 2071, 23, 1, 1.111
3520, 23, 1, 0, 0.997
3520, 23, 1, 1, 0.997
3520, 1, 23, 0, 0.926
3520, 1, 23, 1, 0.927
3520, 55, 1, 0, 0.976
3520, 55, 1, 1, 0.976
3520, 1, 55, 0, 0.902
3520, 1, 55, 1, 0.902
3520, 2071, 1, 0, 0.997
3520, 2071, 1, 1, 0.997
3520, 2049, 23, 0, 0.924
3520, 2049, 23, 1, 0.924
3584, 0, 0, 0, 1.005
3584, 0, 0, 1, 1.004
3584, 24, 0, 0, 0.985
3584, 24, 0, 1, 0.979
3584, 56, 0, 0, 1.006
3584, 56, 0, 1, 1.006
3584, 0, 24, 0, 0.931
3584, 0, 24, 1, 0.931
3584, 0, 56, 0, 0.93
3584, 0, 56, 1, 0.93
3584, 24, 24, 0, 1.111
3584, 24, 24, 1, 1.11
3584, 56, 56, 0, 1.102
3584, 56, 56, 1, 1.101
3584, 2048, 0, 0, 1.006
3584, 2048, 0, 1, 1.005
3584, 2072, 0, 0, 0.983
3584, 2072, 0, 1, 0.977
3584, 2048, 24, 0, 0.896
3584, 2048, 24, 1, 0.897
3584, 2072, 24, 0, 1.111
3584, 2072, 24, 1, 1.111
3584, 24, 1, 0, 1.004
3584, 24, 1, 1, 1.004
3584, 1, 24, 0, 0.921
3584, 1, 24, 1, 0.921
3584, 56, 1, 0, 0.97
3584, 56, 1, 1, 0.97
3584, 1, 56, 0, 0.891
3584, 1, 56, 1, 0.891
3584, 2072, 1, 0, 1.004
3584, 2072, 1, 1, 1.004
3584, 2049, 24, 0, 0.918
3584, 2049, 24, 1, 0.918
3648, 0, 0, 0, 1.012
3648, 0, 0, 1, 1.012
3648, 25, 0, 0, 0.96
3648, 25, 0, 1, 0.96
3648, 57, 0, 0, 0.988
3648, 57, 0, 1, 0.988
3648, 0, 25, 0, 0.927
3648, 0, 25, 1, 0.927
3648, 0, 57, 0, 0.927
3648, 0, 57, 1, 0.927
3648, 25, 25, 0, 1.1
3648, 25, 25, 1, 1.1
3648, 57, 57, 0, 0.986
3648, 57, 57, 1, 0.986
3648, 2048, 0, 0, 1.012
3648, 2048, 0, 1, 1.012
3648, 2073, 0, 0, 0.96
3648, 2073, 0, 1, 0.96
3648, 2048, 25, 0, 0.895
3648, 2048, 25, 1, 0.894
3648, 2073, 25, 0, 1.103
3648, 2073, 25, 1, 1.103
3648, 25, 1, 0, 1.032
3648, 25, 1, 1, 1.032
3648, 1, 25, 0, 0.9
3648, 1, 25, 1, 0.901
3648, 57, 1, 0, 0.974
3648, 57, 1, 1, 0.974
3648, 1, 57, 0, 0.888
3648, 1, 57, 1, 0.888
3648, 2073, 1, 0, 1.032
3648, 2073, 1, 1, 1.032
3648, 2049, 25, 0, 0.895
3648, 2049, 25, 1, 0.896
3712, 0, 0, 0, 0.996
3712, 0, 0, 1, 0.996
3712, 26, 0, 0, 0.959
3712, 26, 0, 1, 0.959
3712, 58, 0, 0, 0.995
3712, 58, 0, 1, 0.995
3712, 0, 26, 0, 0.92
3712, 0, 26, 1, 0.919
3712, 0, 58, 0, 0.931
3712, 0, 58, 1, 0.931
3712, 26, 26, 0, 1.103
3712, 26, 26, 1, 1.101
3712, 58, 58, 0, 0.99
3712, 58, 58, 1, 0.989
3712, 2048, 0, 0, 0.997
3712, 2048, 0, 1, 0.997
3712, 2074, 0, 0, 0.959
3712, 2074, 0, 1, 0.959
3712, 2048, 26, 0, 0.901
3712, 2048, 26, 1, 0.901
3712, 2074, 26, 0, 1.103
3712, 2074, 26, 1, 1.103
3712, 26, 1, 0, 1.001
3712, 26, 1, 1, 1.001
3712, 1, 26, 0, 0.928
3712, 1, 26, 1, 0.928
3712, 58, 1, 0, 0.974
3712, 58, 1, 1, 0.974
3712, 1, 58, 0, 0.903
3712, 1, 58, 1, 0.902
3712, 2074, 1, 0, 1.001
3712, 2074, 1, 1, 1.001
3712, 2049, 26, 0, 0.925
3712, 2049, 26, 1, 0.925
3776, 0, 0, 0, 1.003
3776, 0, 0, 1, 1.003
3776, 27, 0, 0, 0.964
3776, 27, 0, 1, 0.963
3776, 59, 0, 0, 1.004
3776, 59, 0, 1, 1.004
3776, 0, 27, 0, 0.931
3776, 0, 27, 1, 0.931
3776, 0, 59, 0, 0.929
3776, 0, 59, 1, 0.929
3776, 27, 27, 0, 1.097
3776, 27, 27, 1, 1.097
3776, 59, 59, 0, 0.992
3776, 59, 59, 1, 0.992
3776, 2048, 0, 0, 1.003
3776, 2048, 0, 1, 1.003
3776, 2075, 0, 0, 0.964
3776, 2075, 0, 1, 0.963
3776, 2048, 27, 0, 0.898
3776, 2048, 27, 1, 0.898
3776, 2075, 27, 0, 1.097
3776, 2075, 27, 1, 1.097
3776, 27, 1, 0, 0.991
3776, 27, 1, 1, 0.991
3776, 1, 27, 0, 0.919
3776, 1, 27, 1, 0.919
3776, 59, 1, 0, 0.979
3776, 59, 1, 1, 0.979
3776, 1, 59, 0, 0.894
3776, 1, 59, 1, 0.894
3776, 2075, 1, 0, 0.991
3776, 2075, 1, 1, 0.991
3776, 2049, 27, 0, 0.916
3776, 2049, 27, 1, 0.917
3840, 0, 0, 0, 0.998
3840, 0, 0, 1, 0.998
3840, 28, 0, 0, 0.968
3840, 28, 0, 1, 0.968
3840, 60, 0, 0, 1.001
3840, 60, 0, 1, 1.001
3840, 0, 28, 0, 0.927
3840, 0, 28, 1, 0.927
3840, 0, 60, 0, 0.927
3840, 0, 60, 1, 0.927
3840, 28, 28, 0, 1.094
3840, 28, 28, 1, 1.094
3840, 60, 60, 0, 0.982
3840, 60, 60, 1, 0.982
3840, 2048, 0, 0, 0.998
3840, 2048, 0, 1, 0.998
3840, 2076, 0, 0, 0.968
3840, 2076, 0, 1, 0.968
3840, 2048, 28, 0, 0.896
3840, 2048, 28, 1, 0.896
3840, 2076, 28, 0, 1.094
3840, 2076, 28, 1, 1.094
3840, 28, 1, 0, 0.99
3840, 28, 1, 1, 0.99
3840, 1, 28, 0, 0.91
3840, 1, 28, 1, 0.91
3840, 60, 1, 0, 0.969
3840, 60, 1, 1, 0.969
3840, 1, 60, 0, 0.89
3840, 1, 60, 1, 0.891
3840, 2076, 1, 0, 0.99
3840, 2076, 1, 1, 0.99
3840, 2049, 28, 0, 0.906
3840, 2049, 28, 1, 0.906
3904, 0, 0, 0, 1.001
3904, 0, 0, 1, 0.998
3904, 29, 0, 0, 0.961
3904, 29, 0, 1, 0.961
3904, 61, 0, 0, 0.997
3904, 61, 0, 1, 0.997
3904, 0, 29, 0, 0.92
3904, 0, 29, 1, 0.926
3904, 0, 61, 0, 0.933
3904, 0, 61, 1, 0.933
3904, 29, 29, 0, 1.103
3904, 29, 29, 1, 1.103
3904, 61, 61, 0, 0.995
3904, 61, 61, 1, 0.995
3904, 2048, 0, 0, 0.998
3904, 2048, 0, 1, 0.998
3904, 2077, 0, 0, 0.961
3904, 2077, 0, 1, 0.961
3904, 2048, 29, 0, 0.904
3904, 2048, 29, 1, 0.904
3904, 2077, 29, 0, 1.102
3904, 2077, 29, 1, 1.102
3904, 29, 1, 0, 1.0
3904, 29, 1, 1, 1.0
3904, 1, 29, 0, 0.911
3904, 1, 29, 1, 0.911
3904, 61, 1, 0, 0.98
3904, 61, 1, 1, 0.98
3904, 1, 61, 0, 0.904
3904, 1, 61, 1, 0.904
3904, 2077, 1, 0, 1.0
3904, 2077, 1, 1, 1.0
3904, 2049, 29, 0, 0.906
3904, 2049, 29, 1, 0.907
3968, 0, 0, 0, 1.003
3968, 0, 0, 1, 1.003
3968, 30, 0, 0, 0.969
3968, 30, 0, 1, 0.969
3968, 62, 0, 0, 1.005
3968, 62, 0, 1, 1.006
3968, 0, 30, 0, 0.931
3968, 0, 30, 1, 0.931
3968, 0, 62, 0, 0.93
3968, 0, 62, 1, 0.93
3968, 30, 30, 0, 1.103
3968, 30, 30, 1, 1.103
3968, 62, 62, 0, 0.99
3968, 62, 62, 1, 0.99
3968, 2048, 0, 0, 1.004
3968, 2048, 0, 1, 1.004
3968, 2078, 0, 0, 0.968
3968, 2078, 0, 1, 0.969
3968, 2048, 30, 0, 0.899
3968, 2048, 30, 1, 0.899
3968, 2078, 30, 0, 1.105
3968, 2078, 30, 1, 1.105
3968, 30, 1, 0, 0.993
3968, 30, 1, 1, 0.993
3968, 1, 30, 0, 0.914
3968, 1, 30, 1, 0.913
3968, 62, 1, 0, 0.978
3968, 62, 1, 1, 0.978
3968, 1, 62, 0, 0.895
3968, 1, 62, 1, 0.895
3968, 2078, 1, 0, 0.993
3968, 2078, 1, 1, 0.993
3968, 2049, 30, 0, 0.911
3968, 2049, 30, 1, 0.911
4032, 0, 0, 0, 0.995
4032, 0, 0, 1, 0.995
4032, 31, 0, 0, 0.967
4032, 31, 0, 1, 0.967
4032, 63, 0, 0, 1.003
4032, 63, 0, 1, 1.002
4032, 0, 31, 0, 0.927
4032, 0, 31, 1, 0.927
4032, 0, 63, 0, 0.927
4032, 0, 63, 1, 0.927
4032, 31, 31, 0, 1.09
4032, 31, 31, 1, 1.09
4032, 63, 63, 0, 0.987
4032, 63, 63, 1, 0.987
4032, 2048, 0, 0, 0.995
4032, 2048, 0, 1, 0.995
4032, 2079, 0, 0, 0.967
4032, 2079, 0, 1, 0.967
4032, 2048, 31, 0, 0.897
4032, 2048, 31, 1, 0.897
4032, 2079, 31, 0, 1.09
4032, 2079, 31, 1, 1.09
4032, 31, 1, 0, 0.989
4032, 31, 1, 1, 0.989
4032, 1, 31, 0, 0.922
4032, 1, 31, 1, 0.923
4032, 63, 1, 0, 0.971
4032, 63, 1, 1, 0.972
4032, 1, 63, 0, 0.892
4032, 1, 63, 1, 0.892
4032, 2079, 1, 0, 0.988
4032, 2079, 1, 1, 0.988
4032, 2049, 31, 0, 0.919
4032, 2049, 31, 1, 0.919
4096, 32, 0, 0, 1.014
4096, 32, 0, 1, 1.014
4096, 64, 0, 0, 1.014
4096, 64, 0, 1, 1.014
4096, 0, 32, 0, 1.013
4096, 0, 32, 1, 1.013
4096, 0, 64, 0, 1.013
4096, 0, 64, 1, 1.013
4096, 32, 32, 0, 1.014
4096, 32, 32, 1, 1.014
4096, 64, 64, 0, 1.014
4096, 64, 64, 1, 1.014
4096, 2080, 0, 0, 1.014
4096, 2080, 0, 1, 1.014
4096, 2048, 32, 0, 1.014
4096, 2048, 32, 1, 1.014
4096, 2080, 32, 0, 1.014
4096, 2080, 32, 1, 1.014
4096, 32, 1, 0, 0.975
4096, 32, 1, 1, 0.975
4096, 1, 32, 0, 0.769
4096, 1, 32, 1, 0.769
4096, 64, 1, 0, 0.858
4096, 64, 1, 1, 0.858
4096, 1, 64, 0, 0.769
4096, 1, 64, 1, 0.769
4096, 2080, 1, 0, 0.829
4096, 2080, 1, 1, 0.829
4096, 2049, 32, 0, 0.886
4096, 2049, 32, 1, 0.886
4160, 0, 0, 0, 1.003
4160, 0, 0, 1, 1.003
4160, 33, 0, 0, 1.004
4160, 33, 0, 1, 1.004
4160, 65, 0, 0, 0.999
4160, 65, 0, 1, 0.999
4160, 0, 33, 0, 0.931
4160, 0, 33, 1, 0.931
4160, 0, 65, 0, 0.765
4160, 0, 65, 1, 0.765
4160, 33, 33, 0, 0.998
4160, 33, 33, 1, 0.998
4160, 65, 65, 0, 0.942
4160, 65, 65, 1, 0.942
4160, 2048, 0, 0, 1.003
4160, 2048, 0, 1, 1.003
4160, 2081, 0, 0, 1.005
4160, 2081, 0, 1, 1.005
4160, 2048, 33, 0, 0.899
4160, 2048, 33, 1, 0.899
4160, 2081, 33, 0, 1.002
4160, 2081, 33, 1, 1.002
4160, 33, 1, 0, 1.114
4160, 33, 1, 1, 1.114
4160, 1, 33, 0, 1.01
4160, 1, 33, 1, 1.01
4160, 65, 1, 0, 1.077
4160, 65, 1, 1, 1.077
4160, 1, 65, 0, 0.935
4160, 1, 65, 1, 0.936
4160, 2081, 1, 0, 1.077
4160, 2081, 1, 1, 1.077
4160, 2049, 33, 0, 1.008
4160, 2049, 33, 1, 1.007
4224, 0, 0, 0, 1.014
4224, 0, 0, 1, 1.014
4224, 34, 0, 0, 1.0
4224, 34, 0, 1, 1.0
4224, 66, 0, 0, 1.001
4224, 66, 0, 1, 1.001
4224, 0, 34, 0, 0.928
4224, 0, 34, 1, 0.928
4224, 0, 66, 0, 0.762
4224, 0, 66, 1, 0.762
4224, 34, 34, 0, 0.998
4224, 34, 34, 1, 0.998
4224, 66, 66, 0, 0.959
4224, 66, 66, 1, 0.959
4224, 2048, 0, 0, 1.014
4224, 2048, 0, 1, 1.014
4224, 2082, 0, 0, 1.001
4224, 2082, 0, 1, 1.001
4224, 2048, 34, 0, 0.899
4224, 2048, 34, 1, 0.898
4224, 2082, 34, 0, 0.998
4224, 2082, 34, 1, 0.997
4224, 34, 1, 0, 1.024
4224, 34, 1, 1, 1.024
4224, 1, 34, 0, 0.923
4224, 1, 34, 1, 0.923
4224, 66, 1, 0, 1.013
4224, 66, 1, 1, 1.013
4224, 1, 66, 0, 0.917
4224, 1, 66, 1, 0.917
4224, 2082, 1, 0, 1.022
4224, 2082, 1, 1, 1.022
4224, 2049, 34, 0, 0.92
4224, 2049, 34, 1, 0.92
4288, 0, 0, 0, 0.999
4288, 0, 0, 1, 0.999
4288, 35, 0, 0, 0.995
4288, 35, 0, 1, 0.996
4288, 67, 0, 0, 0.998
4288, 67, 0, 1, 0.998
4288, 0, 35, 0, 0.917
4288, 0, 35, 1, 0.919
4288, 0, 67, 0, 0.767
4288, 0, 67, 1, 0.767
4288, 35, 35, 0, 1.004
4288, 35, 35, 1, 1.004
4288, 67, 67, 0, 0.995
4288, 67, 67, 1, 0.995
4288, 2048, 0, 0, 0.999
4288, 2048, 0, 1, 0.999
4288, 2083, 0, 0, 0.995
4288, 2083, 0, 1, 0.995
4288, 2048, 35, 0, 0.905
4288, 2048, 35, 1, 0.904
4288, 2083, 35, 0, 1.004
4288, 2083, 35, 1, 1.004
4288, 35, 1, 0, 1.032
4288, 35, 1, 1, 1.033
4288, 1, 35, 0, 0.928
4288, 1, 35, 1, 0.928
4288, 67, 1, 0, 1.019
4288, 67, 1, 1, 1.019
4288, 1, 67, 0, 0.924
4288, 1, 67, 1, 0.924
4288, 2083, 1, 0, 1.03
4288, 2083, 1, 1, 1.031
4288, 2049, 35, 0, 0.925
4288, 2049, 35, 1, 0.925
4352, 0, 0, 0, 1.005
4352, 0, 0, 1, 1.006
4352, 36, 0, 0, 1.006
4352, 36, 0, 1, 1.007
4352, 68, 0, 0, 1.006
4352, 68, 0, 1, 1.007
4352, 0, 36, 0, 0.929
4352, 0, 36, 1, 0.928
4352, 0, 68, 0, 0.766
4352, 0, 68, 1, 0.765
4352, 36, 36, 0, 0.998
4352, 36, 36, 1, 0.998
4352, 68, 68, 0, 0.964
4352, 68, 68, 1, 0.964
4352, 2048, 0, 0, 1.006
4352, 2048, 0, 1, 1.006
4352, 2084, 0, 0, 1.007
4352, 2084, 0, 1, 1.007
4352, 2048, 36, 0, 0.897
4352, 2048, 36, 1, 0.898
4352, 2084, 36, 0, 0.998
4352, 2084, 36, 1, 0.998
4352, 36, 1, 0, 1.031
4352, 36, 1, 1, 1.031
4352, 1, 36, 0, 0.924
4352, 1, 36, 1, 0.925
4352, 68, 1, 0, 0.999
4352, 68, 1, 1, 0.999
4352, 1, 68, 0, 0.922
4352, 1, 68, 1, 0.922
4352, 2084, 1, 0, 1.032
4352, 2084, 1, 1, 1.03
4352, 2049, 36, 0, 0.923
4352, 2049, 36, 1, 0.923
4416, 0, 0, 0, 0.997
4416, 0, 0, 1, 0.997
4416, 37, 0, 0, 1.001
4416, 37, 0, 1, 1.002
4416, 69, 0, 0, 1.004
4416, 69, 0, 1, 1.003
4416, 0, 37, 0, 0.928
4416, 0, 37, 1, 0.927
4416, 0, 69, 0, 0.762
4416, 0, 69, 1, 0.763
4416, 37, 37, 0, 0.994
4416, 37, 37, 1, 0.994
4416, 69, 69, 0, 0.959
4416, 69, 69, 1, 0.959
4416, 2048, 0, 0, 0.997
4416, 2048, 0, 1, 0.997
4416, 2085, 0, 0, 1.002
4416, 2085, 0, 1, 1.001
4416, 2048, 37, 0, 0.9
4416, 2048, 37, 1, 0.9
4416, 2085, 37, 0, 0.994
4416, 2085, 37, 1, 0.994
4416, 37, 1, 0, 1.024
4416, 37, 1, 1, 1.025
4416, 1, 37, 0, 0.922
4416, 1, 37, 1, 0.922
4416, 69, 1, 0, 1.008
4416, 69, 1, 1, 1.009
4416, 1, 69, 0, 0.913
4416, 1, 69, 1, 0.912
4416, 2085, 1, 0, 1.025
4416, 2085, 1, 1, 1.024
4416, 2049, 37, 0, 0.92
4416, 2049, 37, 1, 0.919
4480, 0, 0, 0, 1.0
4480, 0, 0, 1, 0.998
4480, 38, 0, 0, 0.996
4480, 38, 0, 1, 0.996
4480, 70, 0, 0, 0.992
4480, 70, 0, 1, 0.992
4480, 0, 38, 0, 0.919
4480, 0, 38, 1, 0.916
4480, 0, 70, 0, 0.767
4480, 0, 70, 1, 0.767
4480, 38, 38, 0, 1.002
4480, 38, 38, 1, 1.002
4480, 70, 70, 0, 0.963
4480, 70, 70, 1, 0.963
4480, 2048, 0, 0, 0.998
4480, 2048, 0, 1, 0.998
4480, 2086, 0, 0, 0.996
4480, 2086, 0, 1, 0.996
4480, 2048, 38, 0, 0.907
4480, 2048, 38, 1, 0.907
4480, 2086, 38, 0, 1.002
4480, 2086, 38, 1, 1.002
4480, 38, 1, 0, 1.023
4480, 38, 1, 1, 1.024
4480, 1, 38, 0, 0.914
4480, 1, 38, 1, 0.913
4480, 70, 1, 0, 1.01
4480, 70, 1, 1, 1.011
4480, 1, 70, 0, 0.922
4480, 1, 70, 1, 0.922
4480, 2086, 1, 0, 1.024
4480, 2086, 1, 1, 1.024
4480, 2049, 38, 0, 0.911
4480, 2049, 38, 1, 0.91
4544, 0, 0, 0, 1.002
4544, 0, 0, 1, 1.002
4544, 39, 0, 0, 1.007
4544, 39, 0, 1, 1.007
4544, 71, 0, 0, 1.01
4544, 71, 0, 1, 1.008
4544, 0, 39, 0, 0.93
4544, 0, 39, 1, 0.93
4544, 0, 71, 0, 0.766
4544, 0, 71, 1, 0.766
4544, 39, 39, 0, 1.001
4544, 39, 39, 1, 1.001
4544, 71, 71, 0, 0.966
4544, 71, 71, 1, 0.966
4544, 2048, 0, 0, 1.002
4544, 2048, 0, 1, 1.002
4544, 2087, 0, 0, 1.008
4544, 2087, 0, 1, 1.008
4544, 2048, 39, 0, 0.901
4544, 2048, 39, 1, 0.902
4544, 2087, 39, 0, 1.001
4544, 2087, 39, 1, 1.001
4544, 39, 1, 0, 1.032
4544, 39, 1, 1, 1.032
4544, 1, 39, 0, 0.925
4544, 1, 39, 1, 0.925
4544, 71, 1, 0, 0.997
4544, 71, 1, 1, 0.998
4544, 1, 71, 0, 0.921
4544, 1, 71, 1, 0.922
4544, 2087, 1, 0, 1.032
4544, 2087, 1, 1, 1.032
4544, 2049, 39, 0, 0.924
4544, 2049, 39, 1, 0.923
4608, 0, 0, 0, 0.999
4608, 0, 0, 1, 0.998
4608, 40, 0, 0, 1.013
4608, 40, 0, 1, 1.012
4608, 72, 0, 0, 1.013
4608, 72, 0, 1, 1.013
4608, 0, 40, 0, 0.925
4608, 0, 40, 1, 0.926
4608, 0, 72, 0, 0.765
4608, 0, 72, 1, 0.765
4608, 40, 40, 0, 1.085
4608, 40, 40, 1, 1.086
4608, 72, 72, 0, 0.966
4608, 72, 72, 1, 0.966
4608, 2048, 0, 0, 0.999
4608, 2048, 0, 1, 0.999
4608, 2088, 0, 0, 1.012
4608, 2088, 0, 1, 1.013
4608, 2048, 40, 0, 0.898
4608, 2048, 40, 1, 0.898
4608, 2088, 40, 0, 1.087
4608, 2088, 40, 1, 1.087
4608, 40, 1, 0, 1.006
4608, 40, 1, 1, 1.007
4608, 1, 40, 0, 0.919
4608, 1, 40, 1, 0.919
4608, 72, 1, 0, 1.012
4608, 72, 1, 1, 1.012
4608, 1, 72, 0, 0.914
4608, 1, 72, 1, 0.914
4608, 2088, 1, 0, 1.006
4608, 2088, 1, 1, 1.007
4608, 2049, 40, 0, 0.916
4608, 2049, 40, 1, 0.916
4672, 0, 0, 0, 1.014
4672, 0, 0, 1, 1.014
4672, 41, 0, 0, 1.002
4672, 41, 0, 1, 1.002
4672, 73, 0, 0, 0.976
4672, 73, 0, 1, 0.975
4672, 0, 41, 0, 0.919
4672, 0, 41, 1, 0.919
4672, 0, 73, 0, 0.772
4672, 0, 73, 1, 0.772
4672, 41, 41, 0, 1.012
4672, 41, 41, 1, 1.012
4672, 73, 73, 0, 0.973
4672, 73, 73, 1, 0.973
4672, 2048, 0, 0, 1.014
4672, 2048, 0, 1, 1.014
4672, 2089, 0, 0, 1.003
4672, 2089, 0, 1, 1.002
4672, 2048, 41, 0, 0.907
4672, 2048, 41, 1, 0.908
4672, 2089, 41, 0, 1.012
4672, 2089, 41, 1, 1.012
4672, 41, 1, 0, 1.02
4672, 41, 1, 1, 1.02
4672, 1, 41, 0, 0.916
4672, 1, 41, 1, 0.914
4672, 73, 1, 0, 1.024
4672, 73, 1, 1, 1.024
4672, 1, 73, 0, 0.927
4672, 1, 73, 1, 0.927
4672, 2089, 1, 0, 1.019
4672, 2089, 1, 1, 1.02
4672, 2049, 41, 0, 0.912
4672, 2049, 41, 1, 0.912
4736, 0, 0, 0, 1.007
4736, 0, 0, 1, 1.006
4736, 42, 0, 0, 1.012
4736, 42, 0, 1, 1.013
4736, 74, 0, 0, 0.976
4736, 74, 0, 1, 0.975
4736, 0, 42, 0, 0.93
4736, 0, 42, 1, 0.931
4736, 0, 74, 0, 0.769
4736, 0, 74, 1, 0.77
4736, 42, 42, 0, 1.007
4736, 42, 42, 1, 1.007
4736, 74, 74, 0, 0.965
4736, 74, 74, 1, 0.965
4736, 2048, 0, 0, 1.006
4736, 2048, 0, 1, 1.007
4736, 2090, 0, 0, 1.012
4736, 2090, 0, 1, 1.013
4736, 2048, 42, 0, 0.902
4736, 2048, 42, 1, 0.901
4736, 2090, 42, 0, 1.007
4736, 2090, 42, 1, 1.007
4736, 42, 1, 0, 1.032
4736, 42, 1, 1, 1.032
4736, 1, 42, 0, 0.919
4736, 1, 42, 1, 0.919
4736, 74, 1, 0, 1.017
4736, 74, 1, 1, 1.018
4736, 1, 74, 0, 0.919
4736, 1, 74, 1, 0.918
4736, 2090, 1, 0, 1.031
4736, 2090, 1, 1, 1.031
4736, 2049, 42, 0, 0.916
4736, 2049, 42, 1, 0.916
4800, 0, 0, 0, 1.012
4800, 0, 0, 1, 1.012
4800, 43, 0, 0, 1.008
4800, 43, 0, 1, 1.009
4800, 75, 0, 0, 0.99
4800, 75, 0, 1, 0.99
4800, 0, 43, 0, 0.929
4800, 0, 43, 1, 0.927
4800, 0, 75, 0, 0.768
4800, 0, 75, 1, 0.768
4800, 43, 43, 0, 1.004
4800, 43, 43, 1, 1.004
4800, 75, 75, 0, 0.965
4800, 75, 75, 1, 0.965
4800, 2048, 0, 0, 1.012
4800, 2048, 0, 1, 1.012
4800, 2091, 0, 0, 1.009
4800, 2091, 0, 1, 1.008
4800, 2048, 43, 0, 0.901
4800, 2048, 43, 1, 0.901
4800, 2091, 43, 0, 1.004
4800, 2091, 43, 1, 1.004
4800, 43, 1, 0, 1.026
4800, 43, 1, 1, 1.026
4800, 1, 43, 0, 0.923
4800, 1, 43, 1, 0.922
4800, 75, 1, 0, 0.993
4800, 75, 1, 1, 0.991
4800, 1, 75, 0, 0.921
4800, 1, 75, 1, 0.92
4800, 2091, 1, 0, 1.026
4800, 2091, 1, 1, 1.026
4800, 2049, 43, 0, 0.92
4800, 2049, 43, 1, 0.919
4864, 0, 0, 0, 0.999
4864, 0, 0, 1, 0.999
4864, 44, 0, 0, 0.998
4864, 44, 0, 1, 0.998
4864, 76, 0, 0, 0.981
4864, 76, 0, 1, 0.981
4864, 0, 44, 0, 0.916
4864, 0, 44, 1, 0.918
4864, 0, 76, 0, 0.772
4864, 0, 76, 1, 0.771
4864, 44, 44, 0, 1.006
4864, 44, 44, 1, 1.005
4864, 76, 76, 0, 0.97
4864, 76, 76, 1, 0.97
4864, 2048, 0, 0, 0.999
4864, 2048, 0, 1, 0.999
4864, 2092, 0, 0, 0.997
4864, 2092, 0, 1, 0.997
4864, 2048, 44, 0, 0.908
4864, 2048, 44, 1, 0.907
4864, 2092, 44, 0, 1.005
4864, 2092, 44, 1, 1.005
4864, 44, 1, 0, 0.893
4864, 44, 1, 1, 0.893
4864, 1, 44, 0, 0.922
4864, 1, 44, 1, 0.921
4864, 76, 1, 0, 0.866
4864, 76, 1, 1, 0.866
4864, 1, 76, 0, 0.919
4864, 1, 76, 1, 0.919
4864, 2092, 1, 0, 0.893
4864, 2092, 1, 1, 0.893
4864, 2049, 44, 0, 0.919
4864, 2049, 44, 1, 0.919
4928, 0, 0, 0, 1.005
4928, 0, 0, 1, 1.005
4928, 45, 0, 0, 1.005
4928, 45, 0, 1, 1.005
4928, 77, 0, 0, 0.97
4928, 77, 0, 1, 0.97
4928, 0, 45, 0, 0.931
4928, 0, 45, 1, 0.932
4928, 0, 77, 0, 0.771
4928, 0, 77, 1, 0.771
4928, 45, 45, 0, 1.0
4928, 45, 45, 1, 1.0
4928, 77, 77, 0, 0.972
4928, 77, 77, 1, 0.972
4928, 2048, 0, 0, 1.005
4928, 2048, 0, 1, 1.005
4928, 2093, 0, 0, 1.005
4928, 2093, 0, 1, 1.005
4928, 2048, 45, 0, 0.904
4928, 2048, 45, 1, 0.905
4928, 2093, 45, 0, 1.0
4928, 2093, 45, 1, 1.0
4928, 45, 1, 0, 1.024
4928, 45, 1, 1, 1.024
4928, 1, 45, 0, 0.913
4928, 1, 45, 1, 0.912
4928, 77, 1, 0, 0.996
4928, 77, 1, 1, 0.996
4928, 1, 77, 0, 0.925
4928, 1, 77, 1, 0.925
4928, 2093, 1, 0, 1.025
4928, 2093, 1, 1, 1.024
4928, 2049, 45, 0, 0.916
4928, 2049, 45, 1, 0.911
4992, 0, 0, 0, 1.0
4992, 0, 0, 1, 1.0
4992, 46, 0, 0, 1.009
4992, 46, 0, 1, 1.009
4992, 78, 0, 0, 0.992
4992, 78, 0, 1, 0.992
4992, 0, 46, 0, 0.908
4992, 0, 46, 1, 0.908
4992, 0, 78, 0, 0.751
4992, 0, 78, 1, 0.752
4992, 46, 46, 0, 0.997
4992, 46, 46, 1, 0.997
4992, 78, 78, 0, 0.968
4992, 78, 78, 1, 0.969
4992, 2048, 0, 0, 1.0
4992, 2048, 0, 1, 1.001
4992, 2094, 0, 0, 1.008
4992, 2094, 0, 1, 1.009
4992, 2048, 46, 0, 0.883
4992, 2048, 46, 1, 0.883
4992, 2094, 46, 0, 0.997
4992, 2094, 46, 1, 0.997
4992, 46, 1, 0, 1.025
4992, 46, 1, 1, 1.025
4992, 1, 46, 0, 0.923
4992, 1, 46, 1, 0.923
4992, 78, 1, 0, 1.0
4992, 78, 1, 1, 1.001
4992, 1, 78, 0, 0.92
4992, 1, 78, 1, 0.92
4992, 2094, 1, 0, 1.025
4992, 2094, 1, 1, 1.026
4992, 2049, 46, 0, 0.92
4992, 2049, 46, 1, 0.921
5056, 0, 0, 0, 1.002
5056, 0, 0, 1, 1.001
5056, 47, 0, 0, 1.006
5056, 47, 0, 1, 1.006
5056, 79, 0, 0, 0.99
5056, 79, 0, 1, 0.988
5056, 0, 47, 0, 0.917
5056, 0, 47, 1, 0.916
5056, 0, 79, 0, 0.771
5056, 0, 79, 1, 0.772
5056, 47, 47, 0, 1.006
5056, 47, 47, 1, 1.006
5056, 79, 79, 0, 0.972
5056, 79, 79, 1, 0.973
5056, 2048, 0, 0, 1.003
5056, 2048, 0, 1, 1.001
5056, 2095, 0, 0, 1.005
5056, 2095, 0, 1, 1.004
5056, 2048, 47, 0, 0.908
5056, 2048, 47, 1, 0.909
5056, 2095, 47, 0, 1.006
5056, 2095, 47, 1, 1.006
5056, 47, 1, 0, 1.032
5056, 47, 1, 1, 1.034
5056, 1, 47, 0, 0.926
5056, 1, 47, 1, 0.926
5056, 79, 1, 0, 1.003
5056, 79, 1, 1, 1.004
5056, 1, 79, 0, 0.927
5056, 1, 79, 1, 0.927
5056, 2095, 1, 0, 1.034
5056, 2095, 1, 1, 1.033
5056, 2049, 47, 0, 0.924
5056, 2049, 47, 1, 0.923
5120, 0, 0, 0, 1.003
5120, 0, 0, 1, 1.004
5120, 48, 0, 0, 1.068
5120, 48, 0, 1, 1.068
5120, 80, 0, 0, 1.068
5120, 80, 0, 1, 1.068
5120, 0, 48, 0, 1.065
5120, 0, 48, 1, 1.064
5120, 0, 80, 0, 1.065
5120, 0, 80, 1, 1.065
5120, 48, 48, 0, 1.004
5120, 48, 48, 1, 1.005
5120, 80, 80, 0, 1.005
5120, 80, 80, 1, 1.005
5120, 2048, 0, 0, 1.005
5120, 2048, 0, 1, 1.005
5120, 2096, 0, 0, 1.068
5120, 2096, 0, 1, 1.068
5120, 2048, 48, 0, 1.066
5120, 2048, 48, 1, 1.065
5120, 2096, 48, 0, 1.005
5120, 2096, 48, 1, 1.005
5120, 48, 1, 0, 1.032
5120, 48, 1, 1, 1.032
5120, 1, 48, 0, 0.899
5120, 1, 48, 1, 0.899
5120, 80, 1, 0, 0.844
5120, 80, 1, 1, 0.843
5120, 1, 80, 0, 0.892
5120, 1, 80, 1, 0.892
5120, 2096, 1, 0, 0.856
5120, 2096, 1, 1, 0.856
5120, 2049, 48, 0, 0.898
5120, 2049, 48, 1, 0.898
Results For: bench-memcpy-large
length, align1, align2, dst > src, New Time / Old Time
65543, 0, 0, 0, 0.977
65543, 0, 0, 1, 0.976
65551, 0, 3, 0, 1.01
65551, 0, 3, 1, 1.011
65567, 3, 0, 0, 1.02
65567, 3, 0, 1, 1.02
65599, 3, 5, 0, 1.056
65599, 3, 5, 1, 1.057
65536, 0, 127, 0, 1.043
65536, 0, 127, 1, 1.043
65536, 0, 255, 0, 1.07
65536, 0, 255, 1, 1.071
65536, 0, 256, 0, 0.978
65536, 0, 256, 1, 0.979
65536, 0, 4064, 0, 1.017
65536, 0, 4064, 1, 1.018
131079, 0, 0, 0, 0.979
131079, 0, 0, 1, 0.979
131087, 0, 3, 0, 1.016
131087, 0, 3, 1, 1.016
131103, 3, 0, 0, 1.022
131103, 3, 0, 1, 1.022
131135, 3, 5, 0, 1.063
131135, 3, 5, 1, 1.063
131072, 0, 127, 0, 1.048
131072, 0, 127, 1, 1.048
131072, 0, 255, 0, 1.074
131072, 0, 255, 1, 1.074
131072, 0, 256, 0, 0.982
131072, 0, 256, 1, 0.982
131072, 0, 4064, 0, 1.018
131072, 0, 4064, 1, 1.019
262151, 0, 0, 0, 0.984
262151, 0, 0, 1, 0.984
262159, 0, 3, 0, 1.024
262159, 0, 3, 1, 1.024
262175, 3, 0, 0, 1.03
262175, 3, 0, 1, 1.03
262207, 3, 5, 0, 1.068
262207, 3, 5, 1, 1.069
262144, 0, 127, 0, 1.056
262144, 0, 127, 1, 1.056
262144, 0, 255, 0, 1.078
262144, 0, 255, 1, 1.078
262144, 0, 256, 0, 0.986
262144, 0, 256, 1, 0.986
262144, 0, 4064, 0, 1.02
262144, 0, 4064, 1, 1.02
524295, 0, 0, 0, 0.692
524295, 0, 0, 1, 0.692
524303, 0, 3, 0, 0.736
524303, 0, 3, 1, 0.736
524319, 3, 0, 0, 0.759
524319, 3, 0, 1, 0.759
524351, 3, 5, 0, 0.758
524351, 3, 5, 1, 0.759
524288, 0, 127, 0, 1.057
524288, 0, 127, 1, 1.057
524288, 0, 255, 0, 1.079
524288, 0, 255, 1, 1.079
524288, 0, 256, 0, 0.987
524288, 0, 256, 1, 0.987
524288, 0, 4064, 0, 1.02
524288, 0, 4064, 1, 1.02
1048583, 0, 0, 0, 0.948
1048583, 0, 0, 1, 0.949
1048591, 0, 3, 0, 0.734
1048591, 0, 3, 1, 0.735
1048607, 3, 0, 0, 0.758
1048607, 3, 0, 1, 0.757
1048639, 3, 5, 0, 0.757
1048639, 3, 5, 1, 0.757
1048576, 0, 127, 0, 0.761
1048576, 0, 127, 1, 0.763
1048576, 0, 255, 0, 0.751
1048576, 0, 255, 1, 0.751
1048576, 0, 256, 0, 0.93
1048576, 0, 256, 1, 0.93
1048576, 0, 4064, 0, 0.93
1048576, 0, 4064, 1, 0.93
2097159, 0, 0, 0, 0.928
2097159, 0, 0, 1, 0.931
2097167, 0, 3, 0, 0.735
2097167, 0, 3, 1, 0.734
2097183, 3, 0, 0, 0.759
2097183, 3, 0, 1, 0.76
2097215, 3, 5, 0, 0.758
2097215, 3, 5, 1, 0.757
2097152, 0, 127, 0, 0.77
2097152, 0, 127, 1, 0.77
2097152, 0, 255, 0, 0.745
2097152, 0, 255, 1, 0.745
2097152, 0, 256, 0, 0.924
2097152, 0, 256, 1, 0.925
2097152, 0, 4064, 0, 0.926
2097152, 0, 4064, 1, 0.927
4194311, 0, 0, 0, 0.886
4194311, 0, 0, 1, 0.89
4194319, 0, 3, 0, 0.746
4194319, 0, 3, 1, 0.745
4194335, 3, 0, 0, 0.816
4194335, 3, 0, 1, 0.816
4194367, 3, 5, 0, 0.78
4194367, 3, 5, 1, 0.781
4194304, 0, 127, 0, 0.792
4194304, 0, 127, 1, 0.791
4194304, 0, 255, 0, 0.803
4194304, 0, 255, 1, 0.799
4194304, 0, 256, 0, 0.865
4194304, 0, 256, 1, 0.863
4194304, 0, 4064, 0, 0.953
4194304, 0, 4064, 1, 0.95
8388615, 0, 0, 0, 0.876
8388615, 0, 0, 1, 0.877
8388623, 0, 3, 0, 0.762
8388623, 0, 3, 1, 0.762
8388639, 3, 0, 0, 0.871
8388639, 3, 0, 1, 0.87
8388671, 3, 5, 0, 0.805
8388671, 3, 5, 1, 0.808
8388608, 0, 127, 0, 0.824
8388608, 0, 127, 1, 0.823
8388608, 0, 255, 0, 0.858
8388608, 0, 255, 1, 0.857
8388608, 0, 256, 0, 0.843
8388608, 0, 256, 1, 0.84
8388608, 0, 4064, 0, 0.981
8388608, 0, 4064, 1, 0.981
16777223, 0, 0, 0, 0.881
16777223, 0, 0, 1, 0.882
16777231, 0, 3, 0, 0.765
16777231, 0, 3, 1, 0.765
16777247, 3, 0, 0, 0.87
16777247, 3, 0, 1, 0.87
16777279, 3, 5, 0, 0.807
16777279, 3, 5, 1, 0.811
16777216, 0, 127, 0, 0.827
16777216, 0, 127, 1, 0.827
16777216, 0, 255, 0, 0.858
16777216, 0, 255, 1, 0.857
16777216, 0, 256, 0, 0.848
16777216, 0, 256, 1, 0.844
16777216, 0, 4064, 0, 0.98
16777216, 0, 4064, 1, 0.981
33554439, 0, 0, 0, 0.883
33554439, 0, 0, 1, 0.884
33554447, 0, 3, 0, 0.767
33554447, 0, 3, 1, 0.766
33554463, 3, 0, 0, 0.87
33554463, 3, 0, 1, 0.87
33554495, 3, 5, 0, 0.809
33554495, 3, 5, 1, 0.813
33554432, 0, 127, 0, 0.829
33554432, 0, 127, 1, 0.829
33554432, 0, 255, 0, 0.857
33554432, 0, 255, 1, 0.857
33554432, 0, 256, 0, 0.85
33554432, 0, 256, 1, 0.846
33554432, 0, 4064, 0, 0.981
33554432, 0, 4064, 1, 0.981
Results For: bench-memcpy-random
length, New Time / Old Time
32768, 0.888
65536, 0.906
131072, 0.915
262144, 0.919
524288, 0.921
1048576, 0.929
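(In these tables the reported figure is New Time / Old Time, so values
below 1.0 mean the replacement routine is faster than the SSSE3 one it
replaces; the numbers come from glibc's benchtests, e.g.
bench-memcpy-large and bench-memcpy-random.)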
sysdeps/x86_64/multiarch/Makefile | 1 -
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 3151 ----------------------
sysdeps/x86_64/multiarch/memmove-ssse3.S | 384 ++-
3 files changed, 380 insertions(+), 3156 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
memcmpeq-avx2-rtm \
memcmpeq-evex \
memcmpeq-sse2 \
- memcpy-ssse3 \
memmove-avx-unaligned-erms \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
- Copyright (C) 2010-2022 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY __memcpy_ssse3
-# define MEMCPY_CHK __memcpy_chk_ssse3
-# define MEMPCPY __mempcpy_ssse3
-# define MEMPCPY_CHK __mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B) I - B
-
-/* Branch to an entry in a jump table. TABLE is a jump table with
- relative offsets. INDEX is a register containing the index into the
- jump table. SCALE is the scale of INDEX. */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
- lea TABLE(%rip), %r11; \
- movslq (%r11, INDEX, SCALE), INDEX; \
- lea (%r11, INDEX), INDEX; \
- _CET_NOTRACK jmp *INDEX; \
- ud2
-
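The macro above builds a position-independent jump table: each entry is
a 32-bit offset relative to the table's own address, so movslq
sign-extends an entry and the second lea adds it back to the table base
before the indirect jmp. GCC's labels-as-values extension expresses the
same pattern in C; this sketch is illustrative only (the names are not
glibc's):

#include <stddef.h>

int
dispatch (size_t n)
{
  /* 32-bit offsets relative to a base label -- no dynamic
     relocations, just like L(table_less_80bytes) relative to
     itself.  */
  static const int table[] = { &&len0 - &&len0,
			       &&len1 - &&len0,
			       &&len2 - &&len0 };
  goto *(&&len0 + table[n]);	/* lea; movslq; lea; jmp *reg.  */
 len0:
  return 0;
 len1:
  return 1;
 len2:
  return 2;
}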
- .section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
- mov %RDI_LP, %RAX_LP
- add %RDX_LP, %RAX_LP
- jmp L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
- mov %RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
- add %RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-#endif
-
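The bare "mov %edx, %edx" above looks like a no-op but is not: on
x86-64, writing a 32-bit register zero-extends into the full 64-bit
register, which discards stale upper bits of the x32 (ILP32) length
argument. A hypothetical stand-alone demonstration, assuming GCC inline
asm and its %k (32-bit register name) operand modifier:

#include <stdint.h>
#include <stdio.h>

int
main (void)
{
  uint64_t len = 0xdeadbeef00000007ull;	/* Garbage in bits 63..32.  */
  asm ("mov %k0, %k0" : "+r" (len));	/* E.g. mov %eax, %eax.  */
  printf ("%#llx\n", (unsigned long long) len);	/* Prints 0x7.  */
  return 0;
}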
-#ifdef USE_AS_MEMMOVE
- cmp %rsi, %rdi
- jb L(copy_forward)
- je L(write_0bytes)
- cmp $79, %rdx
- jbe L(copy_forward)
- jmp L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
- cmp $79, %rdx
- lea L(table_less_80bytes)(%rip), %r11
- ja L(80bytesormore)
- movslq (%r11, %rdx, 4), %r9
- add %rdx, %rsi
- add %rdx, %rdi
- add %r11, %r9
- _CET_NOTRACK jmp *%r9
- ud2
-
- .p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
- cmp %dil, %sil
- jle L(copy_backward)
-#endif
-
- movdqu (%rsi), %xmm0
- mov %rdi, %rcx
- and $-16, %rdi
- add $16, %rdi
- mov %rcx, %r8
- sub %rdi, %rcx
- add %rcx, %rdx
- sub %rcx, %rsi
-
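The prologue above saves the (possibly unaligned) first 16 source bytes
in %xmm0, rounds the destination up to the next 16-byte boundary, and
advances the source pointer and shrinks the length by the same 1-16
bytes, so that every store in the main loops is aligned; the saved
%xmm0 is written back later (movdqu %xmm0, (%r8)) to cover the skipped
head. As a C sketch, with names of my own choosing:

#include <stddef.h>
#include <stdint.h>

static void
align_dst_16 (char **dst, const char **src, size_t *len)
{
  uintptr_t d = (uintptr_t) *dst;
  /* and $-16; add $16: next 16-byte boundary strictly above d.  */
  size_t head = ((d & ~(uintptr_t) 15) + 16) - d;	/* 1..16.  */
  *dst += head;
  *src += head;
  *len -= head;
}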
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_fwd)
- and $0xf, %r9
- jz L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
- .p2align 4
-L(copy_backward):
- movdqu -16(%rsi, %rdx), %xmm0
- add %rdx, %rsi
- lea -16(%rdi, %rdx), %r8
- add %rdx, %rdi
-
- mov %rdi, %rcx
- and $0xf, %rcx
- xor %rcx, %rdi
- sub %rcx, %rdx
- sub %rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
- mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
- cmp %rcx, %rdx
- mov %rsi, %r9
- ja L(large_page_bwd)
- and $0xf, %r9
- jz L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
- mov __x86_data_cache_size_half(%rip), %RCX_LP
-#endif
- BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
- .p2align 4
-L(shl_0):
- sub $16, %rdx
- movdqa (%rsi), %xmm1
- add $16, %rsi
- movdqa %xmm1, (%rdi)
- add $16, %rdi
- cmp $128, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes)
- movaps (%rsi), %xmm4
- movaps 16(%rsi), %xmm1
- movaps 32(%rsi), %xmm2
- movaps 48(%rsi), %xmm3
- movaps %xmm4, (%rdi)
- movaps %xmm1, 16(%rdi)
- movaps %xmm2, 32(%rdi)
- movaps %xmm3, 48(%rdi)
- sub $64, %rdx
- add $64, %rsi
- add $64, %rdi
-L(shl_0_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
- movdqa (%rsi), %xmm4
- movaps 0x10(%rsi), %xmm1
- movaps 0x20(%rsi), %xmm2
- movaps 0x30(%rsi), %xmm3
-
- movdqa %xmm4, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
-
- sub $128, %rdx
- movaps 0x40(%rsi), %xmm4
- movaps 0x50(%rsi), %xmm5
- movaps 0x60(%rsi), %xmm6
- movaps 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_cache_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_cache_less_64bytes)
-
- movdqa (%rsi), %xmm4
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm4, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm4
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm4, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_cache_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_loop):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x280(%rsi)
-
- movdqa (%rsi), %xmm0
- movdqa 0x10(%rsi), %xmm1
- movdqa 0x20(%rsi), %xmm2
- movdqa 0x30(%rsi), %xmm3
- movdqa 0x40(%rsi), %xmm4
- movdqa 0x50(%rsi), %xmm5
- movdqa 0x60(%rsi), %xmm6
- movdqa 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- movdqa %xmm2, 0x20(%rdi)
- movdqa %xmm3, 0x30(%rdi)
- movdqa %xmm4, 0x40(%rdi)
- movdqa %xmm5, 0x50(%rdi)
- movdqa %xmm6, 0x60(%rdi)
- movdqa %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_less_64bytes)
-
- movdqa (%rsi), %xmm0
- sub $0x40, %rdx
- movdqa 0x10(%rsi), %xmm1
-
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
-
- movdqa 0x20(%rsi), %xmm0
- movdqa 0x30(%rsi), %xmm1
- add $0x40, %rsi
-
- movdqa %xmm0, 0x20(%rdi)
- movdqa %xmm1, 0x30(%rdi)
- add $0x40, %rdi
-L(shl_0_mem_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_less_32bytes)
- movdqa (%rsi), %xmm0
- sub $0x20, %rdx
- movdqa 0x10(%rsi), %xmm1
- add $0x20, %rsi
- movdqa %xmm0, (%rdi)
- movdqa %xmm1, 0x10(%rdi)
- add $0x20, %rdi
-L(shl_0_mem_less_32bytes):
- add %rdx, %rdi
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
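This _mem_ flavor of the loop is chosen when the remaining size is at
least half the data cache; it differs from the cache loop only in the
two prefetcht0 hints issued 0x1c0 and 0x280 bytes ahead of the current
load position, so the streamed lines are already in flight when the
loop reaches them. A minimal sketch of the idea with intrinsics (the
distance and the 128-byte stand-in body are illustrative, not tuned):

#include <stddef.h>
#include <xmmintrin.h>		/* _mm_prefetch.  */

static void
copy_streaming (char *dst, const char *src, size_t len)
{
  /* len is assumed to be a multiple of 128, as in the unrolled
     loop.  */
  for (size_t i = 0; i < len; i += 128)
    {
      _mm_prefetch (src + i + 0x1c0, _MM_HINT_T0);	/* prefetcht0.  */
      __builtin_memcpy (dst + i, src + i, 128);	/* Stands in for the
						   eight movdqa pairs.  */
    }
}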
- .p2align 4
-L(shl_0_bwd):
- sub $16, %rdx
- movdqa -0x10(%rsi), %xmm1
- sub $16, %rsi
- movdqa %xmm1, -0x10(%rdi)
- sub $16, %rdi
- cmp $0x80, %rdx
- movdqu %xmm0, (%r8)
- ja L(shl_0_gobble_bwd)
- cmp $64, %rdx
- jb L(shl_0_less_64bytes_bwd)
- movaps -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- sub $64, %rdx
- sub $0x40, %rsi
- sub $0x40, %rdi
-L(shl_0_less_64bytes_bwd):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
- cmp $DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
- cmp __x86_data_cache_size_half(%rip), %RDX_LP
-#endif
- lea -128(%rdx), %rdx
- jae L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
- movdqa -0x10(%rsi), %xmm0
- movaps -0x20(%rsi), %xmm1
- movaps -0x30(%rsi), %xmm2
- movaps -0x40(%rsi), %xmm3
-
- movdqa %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
-
- sub $0x80, %rdx
- movaps -0x50(%rsi), %xmm4
- movaps -0x60(%rsi), %xmm5
- movaps -0x70(%rsi), %xmm6
- movaps -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_gobble_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_0_gobble_mem_bwd_loop):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x280(%rsi)
- movdqa -0x10(%rsi), %xmm0
- movdqa -0x20(%rsi), %xmm1
- movdqa -0x30(%rsi), %xmm2
- movdqa -0x40(%rsi), %xmm3
- movdqa -0x50(%rsi), %xmm4
- movdqa -0x60(%rsi), %xmm5
- movdqa -0x70(%rsi), %xmm6
- movdqa -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
- sub $0x80, %rdx
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- movdqa %xmm2, -0x30(%rdi)
- movdqa %xmm3, -0x40(%rdi)
- movdqa %xmm4, -0x50(%rdi)
- movdqa %xmm5, -0x60(%rdi)
- movdqa %xmm6, -0x70(%rdi)
- movdqa %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
-
- jae L(shl_0_gobble_mem_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(shl_0_mem_bwd_less_64bytes)
-
- movdqa -0x10(%rsi), %xmm0
- sub $0x40, %rdx
- movdqa -0x20(%rsi), %xmm1
-
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
-
- movdqa -0x30(%rsi), %xmm0
- movdqa -0x40(%rsi), %xmm1
- sub $0x40, %rsi
-
- movdqa %xmm0, -0x30(%rdi)
- movdqa %xmm1, -0x40(%rdi)
- sub $0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
- cmp $0x20, %rdx
- jb L(shl_0_mem_bwd_less_32bytes)
- movdqa -0x10(%rsi), %xmm0
- sub $0x20, %rdx
- movdqa -0x20(%rsi), %xmm1
- sub $0x20, %rsi
- movdqa %xmm0, -0x10(%rdi)
- movdqa %xmm1, -0x20(%rdi)
- sub $0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_1):
- lea (L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_fwd)
- lea (L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
- sub $64, %rdx
- movaps 0x0f(%rsi), %xmm2
- movaps 0x1f(%rsi), %xmm3
- movaps 0x2f(%rsi), %xmm4
- movaps 0x3f(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $1, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $1, %xmm3, %xmm4
- palignr $1, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $1, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_1_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
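Each L(shl_N) entry handles a source pointer that is N bytes past a
16-byte boundary: the loop performs only aligned loads and stores, and
palignr splices each output vector out of two adjacent aligned source
chunks. A rough intrinsics equivalent, with simplified bounds and names
that are mine rather than glibc's:

#include <stddef.h>
#include <tmmintrin.h>		/* SSSE3: _mm_alignr_epi8.  */

#define SHIFT 1			/* The N in L(shl_N); must be a
				   compile-time constant.  */

/* Copy LEN bytes (a multiple of 16) to 16-byte-aligned DST from
   SRC == aligned_base + SHIFT.  The first load reads a few bytes
   before SRC; the real code guarantees that this is in bounds.  */
static void
copy_shifted (char *dst, const char *src, size_t len)
{
  const __m128i *s = (const __m128i *) (src - SHIFT);
  __m128i prev = _mm_load_si128 (s++);
  for (size_t i = 0; i < len; i += 16)
    {
      __m128i next = _mm_load_si128 (s++);
      /* (next:prev) >> SHIFT bytes == the 16 bytes at src + i.  */
      _mm_store_si128 ((__m128i *) (dst + i),
		       _mm_alignr_epi8 (next, prev, SHIFT));
      prev = next;
    }
}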
- .p2align 4
-L(shl_1_bwd):
- lea (L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x01(%rsi), %xmm1
- jb L(L1_bwd)
- lea (L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
- movaps -0x11(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x21(%rsi), %xmm3
- movaps -0x31(%rsi), %xmm4
- movaps -0x41(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $1, %xmm2, %xmm1
- palignr $1, %xmm3, %xmm2
- palignr $1, %xmm4, %xmm3
- palignr $1, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_1_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_1_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2):
- lea (L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_fwd)
- lea (L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
- sub $64, %rdx
- movaps 0x0e(%rsi), %xmm2
- movaps 0x1e(%rsi), %xmm3
- movaps 0x2e(%rsi), %xmm4
- movaps 0x3e(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $2, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $2, %xmm3, %xmm4
- palignr $2, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $2, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_2_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_2_bwd):
- lea (L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x02(%rsi), %xmm1
- jb L(L2_bwd)
- lea (L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
- movaps -0x12(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x22(%rsi), %xmm3
- movaps -0x32(%rsi), %xmm4
- movaps -0x42(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $2, %xmm2, %xmm1
- palignr $2, %xmm3, %xmm2
- palignr $2, %xmm4, %xmm3
- palignr $2, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_2_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_2_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3):
- lea (L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_fwd)
- lea (L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
- sub $64, %rdx
- movaps 0x0d(%rsi), %xmm2
- movaps 0x1d(%rsi), %xmm3
- movaps 0x2d(%rsi), %xmm4
- movaps 0x3d(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $3, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $3, %xmm3, %xmm4
- palignr $3, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $3, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_3_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_3_bwd):
- lea (L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x03(%rsi), %xmm1
- jb L(L3_bwd)
- lea (L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
- movaps -0x13(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x23(%rsi), %xmm3
- movaps -0x33(%rsi), %xmm4
- movaps -0x43(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $3, %xmm2, %xmm1
- palignr $3, %xmm3, %xmm2
- palignr $3, %xmm4, %xmm3
- palignr $3, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_3_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_3_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4):
- lea (L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_fwd)
- lea (L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
- sub $64, %rdx
- movaps 0x0c(%rsi), %xmm2
- movaps 0x1c(%rsi), %xmm3
- movaps 0x2c(%rsi), %xmm4
- movaps 0x3c(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $4, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $4, %xmm3, %xmm4
- palignr $4, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $4, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_4_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_4_bwd):
- lea (L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x04(%rsi), %xmm1
- jb L(L4_bwd)
- lea (L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
- movaps -0x14(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x24(%rsi), %xmm3
- movaps -0x34(%rsi), %xmm4
- movaps -0x44(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $4, %xmm2, %xmm1
- palignr $4, %xmm3, %xmm2
- palignr $4, %xmm4, %xmm3
- palignr $4, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_4_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_4_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5):
- lea (L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_fwd)
- lea (L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
- sub $64, %rdx
- movaps 0x0b(%rsi), %xmm2
- movaps 0x1b(%rsi), %xmm3
- movaps 0x2b(%rsi), %xmm4
- movaps 0x3b(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $5, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $5, %xmm3, %xmm4
- palignr $5, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $5, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_5_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_5_bwd):
- lea (L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x05(%rsi), %xmm1
- jb L(L5_bwd)
- lea (L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
- movaps -0x15(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x25(%rsi), %xmm3
- movaps -0x35(%rsi), %xmm4
- movaps -0x45(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $5, %xmm2, %xmm1
- palignr $5, %xmm3, %xmm2
- palignr $5, %xmm4, %xmm3
- palignr $5, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_5_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_5_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6):
- lea (L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_fwd)
- lea (L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
- sub $64, %rdx
- movaps 0x0a(%rsi), %xmm2
- movaps 0x1a(%rsi), %xmm3
- movaps 0x2a(%rsi), %xmm4
- movaps 0x3a(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $6, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $6, %xmm3, %xmm4
- palignr $6, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $6, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_6_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_6_bwd):
- lea (L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x06(%rsi), %xmm1
- jb L(L6_bwd)
- lea (L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
- movaps -0x16(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x26(%rsi), %xmm3
- movaps -0x36(%rsi), %xmm4
- movaps -0x46(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $6, %xmm2, %xmm1
- palignr $6, %xmm3, %xmm2
- palignr $6, %xmm4, %xmm3
- palignr $6, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_6_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_6_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7):
- lea (L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_fwd)
- lea (L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
- sub $64, %rdx
- movaps 0x09(%rsi), %xmm2
- movaps 0x19(%rsi), %xmm3
- movaps 0x29(%rsi), %xmm4
- movaps 0x39(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $7, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $7, %xmm3, %xmm4
- palignr $7, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $7, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_7_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_7_bwd):
- lea (L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x07(%rsi), %xmm1
- jb L(L7_bwd)
- lea (L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
- movaps -0x17(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x27(%rsi), %xmm3
- movaps -0x37(%rsi), %xmm4
- movaps -0x47(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $7, %xmm2, %xmm1
- palignr $7, %xmm3, %xmm2
- palignr $7, %xmm4, %xmm3
- palignr $7, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_7_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_7_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8):
- lea (L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_fwd)
- lea (L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
- sub $64, %rdx
- movaps 0x08(%rsi), %xmm2
- movaps 0x18(%rsi), %xmm3
- movaps 0x28(%rsi), %xmm4
- movaps 0x38(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $8, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $8, %xmm3, %xmm4
- palignr $8, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $8, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_8_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
- .p2align 4
-L(shl_8_end):
- lea 64(%rdx), %rdx
- movaps %xmm4, -0x20(%rdi)
- add %rdx, %rsi
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_8_bwd):
- lea (L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x08(%rsi), %xmm1
- jb L(L8_bwd)
- lea (L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
- movaps -0x18(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x28(%rsi), %xmm3
- movaps -0x38(%rsi), %xmm4
- movaps -0x48(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $8, %xmm2, %xmm1
- palignr $8, %xmm3, %xmm2
- palignr $8, %xmm4, %xmm3
- palignr $8, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_8_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_8_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9):
- lea (L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_fwd)
- lea (L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
- sub $64, %rdx
- movaps 0x07(%rsi), %xmm2
- movaps 0x17(%rsi), %xmm3
- movaps 0x27(%rsi), %xmm4
- movaps 0x37(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $9, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $9, %xmm3, %xmm4
- palignr $9, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $9, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_9_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_9_bwd):
- lea (L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x09(%rsi), %xmm1
- jb L(L9_bwd)
- lea (L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
- movaps -0x19(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x29(%rsi), %xmm3
- movaps -0x39(%rsi), %xmm4
- movaps -0x49(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $9, %xmm2, %xmm1
- palignr $9, %xmm3, %xmm2
- palignr $9, %xmm4, %xmm3
- palignr $9, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_9_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_9_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10):
- lea (L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_fwd)
- lea (L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
- sub $64, %rdx
- movaps 0x06(%rsi), %xmm2
- movaps 0x16(%rsi), %xmm3
- movaps 0x26(%rsi), %xmm4
- movaps 0x36(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $10, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $10, %xmm3, %xmm4
- palignr $10, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $10, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_10_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_10_bwd):
- lea (L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0a(%rsi), %xmm1
- jb L(L10_bwd)
- lea (L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
- movaps -0x1a(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2a(%rsi), %xmm3
- movaps -0x3a(%rsi), %xmm4
- movaps -0x4a(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $10, %xmm2, %xmm1
- palignr $10, %xmm3, %xmm2
- palignr $10, %xmm4, %xmm3
- palignr $10, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_10_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_10_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11):
- lea (L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_fwd)
- lea (L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
- sub $64, %rdx
- movaps 0x05(%rsi), %xmm2
- movaps 0x15(%rsi), %xmm3
- movaps 0x25(%rsi), %xmm4
- movaps 0x35(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $11, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $11, %xmm3, %xmm4
- palignr $11, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $11, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_11_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_11_bwd):
- lea (L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0b(%rsi), %xmm1
- jb L(L11_bwd)
- lea (L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
- movaps -0x1b(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2b(%rsi), %xmm3
- movaps -0x3b(%rsi), %xmm4
- movaps -0x4b(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $11, %xmm2, %xmm1
- palignr $11, %xmm3, %xmm2
- palignr $11, %xmm4, %xmm3
- palignr $11, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_11_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_11_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12):
- lea (L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_fwd)
- lea (L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
- sub $64, %rdx
- movaps 0x04(%rsi), %xmm2
- movaps 0x14(%rsi), %xmm3
- movaps 0x24(%rsi), %xmm4
- movaps 0x34(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $12, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $12, %xmm3, %xmm4
- palignr $12, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $12, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_12_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_12_bwd):
- lea (L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0c(%rsi), %xmm1
- jb L(L12_bwd)
- lea (L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
- movaps -0x1c(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2c(%rsi), %xmm3
- movaps -0x3c(%rsi), %xmm4
- movaps -0x4c(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $12, %xmm2, %xmm1
- palignr $12, %xmm3, %xmm2
- palignr $12, %xmm4, %xmm3
- palignr $12, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_12_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_12_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13):
- lea (L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_fwd)
- lea (L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
- sub $64, %rdx
- movaps 0x03(%rsi), %xmm2
- movaps 0x13(%rsi), %xmm3
- movaps 0x23(%rsi), %xmm4
- movaps 0x33(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $13, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $13, %xmm3, %xmm4
- palignr $13, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $13, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_13_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_13_bwd):
- lea (L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0d(%rsi), %xmm1
- jb L(L13_bwd)
- lea (L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
- movaps -0x1d(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2d(%rsi), %xmm3
- movaps -0x3d(%rsi), %xmm4
- movaps -0x4d(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $13, %xmm2, %xmm1
- palignr $13, %xmm3, %xmm2
- palignr $13, %xmm4, %xmm3
- palignr $13, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_13_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_13_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14):
- lea (L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_fwd)
- lea (L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
- sub $64, %rdx
- movaps 0x02(%rsi), %xmm2
- movaps 0x12(%rsi), %xmm3
- movaps 0x22(%rsi), %xmm4
- movaps 0x32(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $14, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $14, %xmm3, %xmm4
- palignr $14, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $14, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_14_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_14_bwd):
- lea (L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0e(%rsi), %xmm1
- jb L(L14_bwd)
- lea (L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
- movaps -0x1e(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2e(%rsi), %xmm3
- movaps -0x3e(%rsi), %xmm4
- movaps -0x4e(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $14, %xmm2, %xmm1
- palignr $14, %xmm3, %xmm2
- palignr $14, %xmm4, %xmm3
- palignr $14, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_14_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_14_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15):
- lea (L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_fwd)
- lea (L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_loop_L2):
- prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
- sub $64, %rdx
- movaps 0x01(%rsi), %xmm2
- movaps 0x11(%rsi), %xmm3
- movaps 0x21(%rsi), %xmm4
- movaps 0x31(%rsi), %xmm5
- movdqa %xmm5, %xmm6
- palignr $15, %xmm4, %xmm5
- lea 64(%rsi), %rsi
- palignr $15, %xmm3, %xmm4
- palignr $15, %xmm2, %xmm3
- lea 64(%rdi), %rdi
- palignr $15, %xmm1, %xmm2
- movdqa %xmm6, %xmm1
- movdqa %xmm2, -0x40(%rdi)
- movaps %xmm3, -0x30(%rdi)
- jb L(shl_15_end)
- movaps %xmm4, -0x20(%rdi)
- movaps %xmm5, -0x10(%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_end):
- movaps %xmm4, -0x20(%rdi)
- lea 64(%rdx), %rdx
- movaps %xmm5, -0x10(%rdi)
- add %rdx, %rdi
- movdqu %xmm0, (%r8)
- add %rdx, %rsi
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(shl_15_bwd):
- lea (L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
- cmp %rcx, %rdx
- movaps -0x0f(%rsi), %xmm1
- jb L(L15_bwd)
- lea (L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
- lea -64(%rdx), %rdx
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_loop_L2):
- prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
- movaps -0x1f(%rsi), %xmm2
- sub $0x40, %rdx
- movaps -0x2f(%rsi), %xmm3
- movaps -0x3f(%rsi), %xmm4
- movaps -0x4f(%rsi), %xmm5
- lea -0x40(%rsi), %rsi
- palignr $15, %xmm2, %xmm1
- palignr $15, %xmm3, %xmm2
- palignr $15, %xmm4, %xmm3
- palignr $15, %xmm5, %xmm4
-
- movaps %xmm1, -0x10(%rdi)
- movaps %xmm5, %xmm1
-
- movaps %xmm2, -0x20(%rdi)
- lea -0x40(%rdi), %rdi
-
- movaps %xmm3, 0x10(%rdi)
- jb L(shl_15_bwd_end)
- movaps %xmm4, (%rdi)
- _CET_NOTRACK jmp *%r9
- ud2
-L(shl_15_bwd_end):
- movaps %xmm4, (%rdi)
- lea 64(%rdx), %rdx
- movdqu %xmm0, (%r8)
- BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
- .p2align 4
-L(write_72bytes):
- movdqu -72(%rsi), %xmm0
- movdqu -56(%rsi), %xmm1
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -72(%rdi)
- movdqu %xmm1, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_64bytes):
- movdqu -64(%rsi), %xmm0
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- movdqu %xmm0, -64(%rdi)
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_56bytes):
- movdqu -56(%rsi), %xmm0
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rcx
- movdqu %xmm0, -56(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rcx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_48bytes):
- mov -48(%rsi), %rcx
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %rcx, -48(%rdi)
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_40bytes):
- mov -40(%rsi), %r8
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r8, -40(%rdi)
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_32bytes):
- mov -32(%rsi), %r9
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r9, -32(%rdi)
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_24bytes):
- mov -24(%rsi), %r10
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r10, -24(%rdi)
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_16bytes):
- mov -16(%rsi), %r11
- mov -8(%rsi), %rdx
- mov %r11, -16(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_8bytes):
- mov -8(%rsi), %rdx
- mov %rdx, -8(%rdi)
-L(write_0bytes):
- ret
-
- .p2align 4
-L(write_73bytes):
- movdqu -73(%rsi), %xmm0
- movdqu -57(%rsi), %xmm1
- mov -41(%rsi), %rcx
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %r8
- mov -4(%rsi), %edx
- movdqu %xmm0, -73(%rdi)
- movdqu %xmm1, -57(%rdi)
- mov %rcx, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %r8, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_65bytes):
- movdqu -65(%rsi), %xmm0
- movdqu -49(%rsi), %xmm1
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -65(%rdi)
- movdqu %xmm1, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_57bytes):
- movdqu -57(%rsi), %xmm0
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -57(%rdi)
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_49bytes):
- movdqu -49(%rsi), %xmm0
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -49(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_41bytes):
- mov -41(%rsi), %r8
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r8, -41(%rdi)
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_33bytes):
- mov -33(%rsi), %r9
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r9, -33(%rdi)
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_25bytes):
- mov -25(%rsi), %r10
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -1(%rsi), %dl
- mov %r10, -25(%rdi)
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_17bytes):
- mov -17(%rsi), %r11
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -17(%rdi)
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_9bytes):
- mov -9(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -9(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_1bytes):
- mov -1(%rsi), %dl
- mov %dl, -1(%rdi)
- ret
-
- .p2align 4
-L(write_74bytes):
- movdqu -74(%rsi), %xmm0
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -74(%rdi)
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_66bytes):
- movdqu -66(%rsi), %xmm0
- movdqu -50(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -66(%rdi)
- movdqu %xmm1, -50(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_58bytes):
- movdqu -58(%rsi), %xmm1
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm1, -58(%rdi)
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_50bytes):
- movdqu -50(%rsi), %xmm0
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -50(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_42bytes):
- mov -42(%rsi), %r8
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -42(%rdi)
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_34bytes):
- mov -34(%rsi), %r9
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -34(%rdi)
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_26bytes):
- mov -26(%rsi), %r10
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -26(%rdi)
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_18bytes):
- mov -18(%rsi), %r11
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -18(%rdi)
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_10bytes):
- mov -10(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -10(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_2bytes):
- mov -2(%rsi), %dx
- mov %dx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_75bytes):
- movdqu -75(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -75(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_67bytes):
- movdqu -67(%rsi), %xmm0
- movdqu -59(%rsi), %xmm1
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -67(%rdi)
- movdqu %xmm1, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_59bytes):
- movdqu -59(%rsi), %xmm0
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -59(%rdi)
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_51bytes):
- movdqu -51(%rsi), %xmm0
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -51(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_43bytes):
- mov -43(%rsi), %r8
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -43(%rdi)
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_35bytes):
- mov -35(%rsi), %r9
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -35(%rdi)
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_27bytes):
- mov -27(%rsi), %r10
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -27(%rdi)
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_19bytes):
- mov -19(%rsi), %r11
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -19(%rdi)
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_11bytes):
- mov -11(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -11(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_3bytes):
- mov -3(%rsi), %dx
- mov -2(%rsi), %cx
- mov %dx, -3(%rdi)
- mov %cx, -2(%rdi)
- ret
-
- .p2align 4
-L(write_76bytes):
- movdqu -76(%rsi), %xmm0
- movdqu -60(%rsi), %xmm1
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -76(%rdi)
- movdqu %xmm1, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_68bytes):
- movdqu -68(%rsi), %xmm0
- movdqu -52(%rsi), %xmm1
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -68(%rdi)
- movdqu %xmm1, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_60bytes):
- movdqu -60(%rsi), %xmm0
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -60(%rdi)
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_52bytes):
- movdqu -52(%rsi), %xmm0
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- movdqu %xmm0, -52(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_44bytes):
- mov -44(%rsi), %r8
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r8, -44(%rdi)
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_36bytes):
- mov -36(%rsi), %r9
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r9, -36(%rdi)
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_28bytes):
- mov -28(%rsi), %r10
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r10, -28(%rdi)
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_20bytes):
- mov -20(%rsi), %r11
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %r11, -20(%rdi)
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_12bytes):
- mov -12(%rsi), %rcx
- mov -4(%rsi), %edx
- mov %rcx, -12(%rdi)
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_4bytes):
- mov -4(%rsi), %edx
- mov %edx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_77bytes):
- movdqu -77(%rsi), %xmm0
- movdqu -61(%rsi), %xmm1
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -77(%rdi)
- movdqu %xmm1, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_69bytes):
- movdqu -69(%rsi), %xmm0
- movdqu -53(%rsi), %xmm1
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -69(%rdi)
- movdqu %xmm1, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_61bytes):
- movdqu -61(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -61(%rdi)
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_53bytes):
- movdqu -53(%rsi), %xmm0
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -53(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_45bytes):
- mov -45(%rsi), %r8
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -45(%rdi)
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_37bytes):
- mov -37(%rsi), %r9
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -37(%rdi)
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_29bytes):
- mov -29(%rsi), %r10
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -29(%rdi)
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_21bytes):
- mov -21(%rsi), %r11
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -21(%rdi)
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_13bytes):
- mov -13(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -13(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_5bytes):
- mov -5(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -5(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_78bytes):
- movdqu -78(%rsi), %xmm0
- movdqu -62(%rsi), %xmm1
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -78(%rdi)
- movdqu %xmm1, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_70bytes):
- movdqu -70(%rsi), %xmm0
- movdqu -54(%rsi), %xmm1
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -70(%rdi)
- movdqu %xmm1, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_62bytes):
- movdqu -62(%rsi), %xmm0
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -62(%rdi)
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_54bytes):
- movdqu -54(%rsi), %xmm0
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -54(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_46bytes):
- mov -46(%rsi), %r8
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -46(%rdi)
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_38bytes):
- mov -38(%rsi), %r9
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -38(%rdi)
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_30bytes):
- mov -30(%rsi), %r10
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -30(%rdi)
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_22bytes):
- mov -22(%rsi), %r11
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -22(%rdi)
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_14bytes):
- mov -14(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -14(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_6bytes):
- mov -6(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -6(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(write_79bytes):
- movdqu -79(%rsi), %xmm0
- movdqu -63(%rsi), %xmm1
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -79(%rdi)
- movdqu %xmm1, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_71bytes):
- movdqu -71(%rsi), %xmm0
- movdqu -55(%rsi), %xmm1
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -71(%rdi)
- movdqu %xmm1, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_63bytes):
- movdqu -63(%rsi), %xmm0
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -63(%rdi)
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_55bytes):
- movdqu -55(%rsi), %xmm0
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- movdqu %xmm0, -55(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_47bytes):
- mov -47(%rsi), %r8
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r8, -47(%rdi)
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_39bytes):
- mov -39(%rsi), %r9
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r9, -39(%rdi)
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_31bytes):
- mov -31(%rsi), %r10
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r10, -31(%rdi)
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_23bytes):
- mov -23(%rsi), %r11
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %r11, -23(%rdi)
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_15bytes):
- mov -15(%rsi), %rcx
- mov -8(%rsi), %rdx
- mov %rcx, -15(%rdi)
- mov %rdx, -8(%rdi)
- ret
-
- .p2align 4
-L(write_7bytes):
- mov -7(%rsi), %edx
- mov -4(%rsi), %ecx
- mov %edx, -7(%rdi)
- mov %ecx, -4(%rdi)
- ret
-
- .p2align 4
-L(large_page_fwd):
- movdqu (%rsi), %xmm1
- lea 16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movntdq %xmm1, (%rdi)
- lea 16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rsi, %r9
- sub %rdi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_fwd)
- shl $2, %rcx
- cmp %rcx, %rdx
- jb L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- movntdq %xmm4, 0x40(%rdi)
- movntdq %xmm5, 0x50(%rdi)
- movntdq %xmm6, 0x60(%rdi)
- movntdq %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(large_page_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movntdq %xmm0, (%rdi)
- movntdq %xmm1, 0x10(%rdi)
- movntdq %xmm2, 0x20(%rdi)
- movntdq %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_fwd_start):
- prefetcht0 0x1c0(%rsi)
- prefetcht0 0x200(%rsi)
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- movdqu 0x40(%rsi), %xmm4
- movdqu 0x50(%rsi), %xmm5
- movdqu 0x60(%rsi), %xmm6
- movdqu 0x70(%rsi), %xmm7
- lea 0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- movaps %xmm4, 0x40(%rdi)
- movaps %xmm5, 0x50(%rdi)
- movaps %xmm6, 0x60(%rdi)
- movaps %xmm7, 0x70(%rdi)
- lea 0x80(%rdi), %rdi
- jae L(ll_cache_copy_fwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_fwd_64bytes)
-
- movdqu (%rsi), %xmm0
- movdqu 0x10(%rsi), %xmm1
- movdqu 0x20(%rsi), %xmm2
- movdqu 0x30(%rsi), %xmm3
- lea 0x40(%rsi), %rsi
-
- movaps %xmm0, (%rdi)
- movaps %xmm1, 0x10(%rdi)
- movaps %xmm2, 0x20(%rdi)
- movaps %xmm3, 0x30(%rdi)
- lea 0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
- add %rdx, %rsi
- add %rdx, %rdi
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
- .p2align 4
-L(large_page_bwd):
- movdqu -0x10(%rsi), %xmm1
- lea -16(%rsi), %rsi
- movdqu %xmm0, (%r8)
- movdqa %xmm1, -0x10(%rdi)
- lea -16(%rdi), %rdi
- lea -0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
- mov %rdi, %r9
- sub %rsi, %r9
- cmp %rdx, %r9
- jae L(memmove_is_memcpy_bwd)
- cmp %rcx, %r9
- jb L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- movntdq %xmm4, -0x50(%rdi)
- movntdq %xmm5, -0x60(%rdi)
- movntdq %xmm6, -0x70(%rdi)
- movntdq %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(large_page_bwd_loop)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movntdq %xmm0, -0x10(%rdi)
- movntdq %xmm1, -0x20(%rdi)
- movntdq %xmm2, -0x30(%rdi)
- movntdq %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_less_bwd_64bytes):
- sfence
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
- .p2align 4
-L(ll_cache_copy_bwd_start):
- prefetcht0 -0x1c0(%rsi)
- prefetcht0 -0x200(%rsi)
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- movdqu -0x50(%rsi), %xmm4
- movdqu -0x60(%rsi), %xmm5
- movdqu -0x70(%rsi), %xmm6
- movdqu -0x80(%rsi), %xmm7
- lea -0x80(%rsi), %rsi
-
- sub $0x80, %rdx
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- movaps %xmm4, -0x50(%rdi)
- movaps %xmm5, -0x60(%rdi)
- movaps %xmm6, -0x70(%rdi)
- movaps %xmm7, -0x80(%rdi)
- lea -0x80(%rdi), %rdi
- jae L(ll_cache_copy_bwd_start)
- cmp $-0x40, %rdx
- lea 0x80(%rdx), %rdx
- jl L(large_page_ll_less_bwd_64bytes)
-
- movdqu -0x10(%rsi), %xmm0
- movdqu -0x20(%rsi), %xmm1
- movdqu -0x30(%rsi), %xmm2
- movdqu -0x40(%rsi), %xmm3
- lea -0x40(%rsi), %rsi
-
- movaps %xmm0, -0x10(%rdi)
- movaps %xmm1, -0x20(%rdi)
- movaps %xmm2, -0x30(%rdi)
- movaps %xmm3, -0x40(%rdi)
- lea -0x40(%rdi), %rdi
- sub $0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
- BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
- .section .rodata.ssse3,"a",@progbits
- .p2align 3
-L(table_less_80bytes):
- .int JMPTBL (L(write_0bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_1bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_2bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_3bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_4bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_5bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_6bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_7bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_8bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_9bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_10bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_11bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_12bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_13bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_14bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_15bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_16bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_17bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_18bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_19bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_20bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_21bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_22bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_23bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_24bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_25bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_26bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_27bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_28bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_29bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_30bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_31bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_32bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_33bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_34bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_35bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_36bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_37bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_38bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_39bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_40bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_41bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_42bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_43bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_44bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_45bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_46bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_47bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_48bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_49bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_50bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_51bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_52bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_53bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_54bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_55bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_56bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_57bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_58bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_59bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_60bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_61bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_62bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_63bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_64bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_65bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_66bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_67bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_68bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_69bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_70bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_71bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_72bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_73bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_74bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_75bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_76bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_77bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_78bytes), L(table_less_80bytes))
- .int JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
- .p2align 3
-L(shl_table):
- .int JMPTBL (L(shl_0), L(shl_table))
- .int JMPTBL (L(shl_1), L(shl_table))
- .int JMPTBL (L(shl_2), L(shl_table))
- .int JMPTBL (L(shl_3), L(shl_table))
- .int JMPTBL (L(shl_4), L(shl_table))
- .int JMPTBL (L(shl_5), L(shl_table))
- .int JMPTBL (L(shl_6), L(shl_table))
- .int JMPTBL (L(shl_7), L(shl_table))
- .int JMPTBL (L(shl_8), L(shl_table))
- .int JMPTBL (L(shl_9), L(shl_table))
- .int JMPTBL (L(shl_10), L(shl_table))
- .int JMPTBL (L(shl_11), L(shl_table))
- .int JMPTBL (L(shl_12), L(shl_table))
- .int JMPTBL (L(shl_13), L(shl_table))
- .int JMPTBL (L(shl_14), L(shl_table))
- .int JMPTBL (L(shl_15), L(shl_table))
-
- .p2align 3
-L(shl_table_bwd):
- .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
- .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
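For reference, the jump tables at the end of the file removed above drive BRANCH_TO_JMPTBL_ENTRY: each JMPTBL slot is stored as a 32-bit offset of the target label relative to the table itself, which keeps the tables position independent. A minimal C model of the lookup, assuming that offset convention (the function name is illustrative only):

    #include <stdint.h>
    #include <stddef.h>

    /* Each .int slot holds `label - table`, so resolving a target is a
       sign-extending load plus the table's own address, followed by an
       indirect jump (the sequence BRANCH_TO_JMPTBL_ENTRY expands to).  */
    static void *
    jmptbl_target (const int32_t table[], size_t index)
    {
      return (char *) table + table[index];
    }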
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..215583e7bd 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY __memmove_ssse3
-#define MEMCPY_CHK __memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE __memmove_ssse3
+# define MEMMOVE_CHK __memmove_chk_ssse3
+# define MEMCPY __memcpy_ssse3
+# define MEMCPY_CHK __memcpy_chk_ssse3
+# define MEMPCPY __mempcpy_ssse3
+# define MEMPCPY_CHK __mempcpy_chk_ssse3
+#endif
+
+ .section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
+ jmp L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+ movq %rdi, %rax
+L(start):
+ cmpq $16, %rdx
+ jb L(copy_0_15)
+
+ /* These loads are always useful. */
+ movups 0(%rsi), %xmm0
+ movups -16(%rsi, %rdx), %xmm7
+ cmpq $32, %rdx
+ ja L(more_2x_vec)
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm7, -16(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 4
+L(copy_0_15):
+ cmpl $4, %edx
+ jb L(copy_0_3)
+ cmpl $8, %edx
+ jb L(copy_4_7)
+ movq 0(%rsi), %rcx
+ movq -8(%rsi, %rdx), %rsi
+ movq %rcx, 0(%rdi)
+ movq %rsi, -8(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 4
+L(copy_4_7):
+ movl 0(%rsi), %ecx
+ movl -4(%rsi, %rdx), %esi
+ movl %ecx, 0(%rdi)
+ movl %esi, -4(%rdi, %rdx)
+ ret
+
+ .p2align 4,, 4
+L(copy_0_3):
+ decl %edx
+ jl L(copy_0_0)
+ movb (%rsi), %cl
+ je L(copy_1_1)
+
+ movzwl -1(%rsi, %rdx), %esi
+ movw %si, -1(%rdi, %rdx)
+L(copy_1_1):
+ movb %cl, (%rdi)
+L(copy_0_0):
+ ret
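The 4-15 byte paths above use the overlapping head-and-tail trick instead of byte loops: one load from each end at the widest unit that fits, then two stores that may overlap but both carry correct bytes. A minimal C sketch of the idea (name hypothetical):

    #include <stdint.h>
    #include <string.h>

    /* Mirrors L(copy_0_15) and L(copy_4_7): load both ends before
       storing so overlapping src/dst stay memmove-safe; L(copy_0_3)
       plays the same game with a byte and a 2-byte word.  */
    static void
    copy_small_model (char *dst, const char *src, size_t len)
    {
      if (len >= 8)       /* 8 <= len <= 15.  */
        {
          uint64_t a, b;
          memcpy (&a, src, 8);
          memcpy (&b, src + len - 8, 8);
          memcpy (dst, &a, 8);
          memcpy (dst + len - 8, &b, 8);
        }
      else if (len >= 4)  /* 4 <= len <= 7.  */
        {
          uint32_t a, b;
          memcpy (&a, src, 4);
          memcpy (&b, src + len - 4, 4);
          memcpy (dst, &a, 4);
          memcpy (dst + len - 4, &b, 4);
        }
    }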
+
+ .p2align 4,, 4
+L(copy_4x_vec):
+ movups 16(%rsi), %xmm1
+ movups -32(%rsi, %rdx), %xmm2
+
+ movups %xmm0, 0(%rdi)
+ movups %xmm1, 16(%rdi)
+ movups %xmm2, -32(%rdi, %rdx)
+ movups %xmm7, -16(%rdi, %rdx)
+L(nop):
+ ret
+
+ .p2align 4
+L(more_2x_vec):
+ cmpq $64, %rdx
+ jbe L(copy_4x_vec)
+
+	/* We use rcx later to get the palignr shift value. */
+ movq %rdi, %rcx
+
+	/* Copy backward when dst > src and the regions overlap, for
+	   memmove safety. */
+ subq %rsi, %rcx
+ cmpq %rdx, %rcx
+ jb L(copy_backward)
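The subq/cmpq/jb triple above is the usual unsigned-difference overlap test; one compare covers both conditions that force a backward copy. As a C model (name hypothetical):

    #include <stdint.h>
    #include <stddef.h>

    /* Backward copy is needed exactly when dst is above src and the
       regions overlap within len bytes; computing dst - src as an
       unsigned value folds both checks into a single compare.  The
       dst == src case also lands here and is filtered out by the
       testq at L(copy_backward).  */
    static int
    must_copy_backward (const char *dst, const char *src, size_t len)
    {
      return (uintptr_t) (dst - src) < len;
    }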
+
+ /* Load tail. */
+
+ /* -16(%rsi, %rdx) already loaded into xmm7. */
+ movups -32(%rsi, %rdx), %xmm8
+ movups -48(%rsi, %rdx), %xmm9
+
+ /* Get misalignment. */
+ andl $0xf, %ecx
+
+ movq %rsi, %r9
+ addq %rcx, %rsi
+ andq $-16, %rsi
+ /* Get first vec for `palignr`. */
+ movaps (%rsi), %xmm1
+
+	/* We have already loaded (%rsi), so it is safe to do this store
+	   before the loop. */
+ movups %xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+ cmp $SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+ cmp __x86_shared_cache_size_half(%rip), %rdx
+#endif
+ ja L(large_memcpy)
+
+ leaq -64(%rdi, %rdx), %r8
+ andq $-16, %rdi
+ movl $48, %edx
+
+ leaq L(loop_fwd_start)(%rip), %r9
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(copy_backward):
+ testq %rcx, %rcx
+ jz L(nop)
+
+ /* Preload tail. */
+
+ /* (%rsi) already loaded into xmm0. */
+ movups 16(%rsi), %xmm4
+ movups 32(%rsi), %xmm5
+
+ movq %rdi, %r8
+ subq %rdi, %rsi
+ leaq -49(%rdi, %rdx), %rdi
+ andq $-16, %rdi
+ addq %rdi, %rsi
+ andq $-16, %rsi
+
+ movaps 48(%rsi), %xmm6
+
+
+ leaq L(loop_bkwd_start)(%rip), %r9
+ andl $0xf, %ecx
+ sall $6, %ecx
+ addq %r9, %rcx
+ jmp * %rcx
+
+ .p2align 4,, 8
+L(large_memcpy):
+ movups -64(%r9, %rdx), %xmm10
+ movups -80(%r9, %rdx), %xmm11
+
+ sall $5, %ecx
+ leal (%rcx, %rcx, 2), %r8d
+ leaq -96(%rdi, %rdx), %rcx
+ andq $-16, %rdi
+ leaq L(large_loop_fwd_start)(%rip), %rdx
+ addq %r8, %rdx
+ jmp * %rdx
+
+
+	/* Instead of a typical jump table, all 16 loops are exactly
+	   64 bytes in size, so we can simply jump to the first loop plus
+	   misalignment * 64. Before modifying any loop, ensure all of
+	   their sizes still match!  */
+ .p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps %xmm1, 16(%rdi)
+ movaps %xmm2, 32(%rdi)
+ movaps %xmm3, 48(%rdi)
+ addq %rdx, %rdi
+ addq %rdx, %rsi
+ cmpq %rdi, %r8
+ ja L(loop_fwd_0x0)
+L(end_loop_fwd):
+ movups %xmm9, 16(%r8)
+ movups %xmm8, 32(%r8)
+ movups %xmm7, 48(%r8)
+ ret
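Per the comment before L(loop_fwd_start), entry into these loops is pure address arithmetic rather than a table load. A sketch of the computation (names illustrative):

    #include <stddef.h>

    /* Mirrors `sall $6, %ecx; addq %r9, %rcx; jmp *%rcx`: every
       aligned loop body is padded to exactly 64 bytes, so the handler
       for a source misalignment of 0..15 sits at a fixed 64-byte
       stride from the first loop.  The large path below does the same
       with a 96-byte stride.  */
    static void *
    loop_entry (void *loop_fwd_start, unsigned misalign)
    {
      return (char *) loop_fwd_start + (size_t) misalign * 64;
    }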
+
+	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` uses the long
+	   encoding, 60 bytes otherwise. */
+#define ALIGNED_LOOP_FWD(align_by); \
+ .p2align 6; \
+L(loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm4, %xmm1; \
+ movaps %xmm0, 16(%rdi); \
+ movaps %xmm2, 32(%rdi); \
+ movaps %xmm3, 48(%rdi); \
+ addq %rdx, %rdi; \
+ addq %rdx, %rsi; \
+ cmpq %rdi, %r8; \
+ ja L(loop_fwd_ ## align_by); \
+ jmp L(end_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_FWD (0xf)
+ ALIGNED_LOOP_FWD (0xe)
+ ALIGNED_LOOP_FWD (0xd)
+ ALIGNED_LOOP_FWD (0xc)
+ ALIGNED_LOOP_FWD (0xb)
+ ALIGNED_LOOP_FWD (0xa)
+ ALIGNED_LOOP_FWD (0x9)
+ ALIGNED_LOOP_FWD (0x8)
+ ALIGNED_LOOP_FWD (0x7)
+ ALIGNED_LOOP_FWD (0x6)
+ ALIGNED_LOOP_FWD (0x5)
+ ALIGNED_LOOP_FWD (0x4)
+ ALIGNED_LOOP_FWD (0x3)
+ ALIGNED_LOOP_FWD (0x2)
+ ALIGNED_LOOP_FWD (0x1)
+
+ .p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+ movaps 16(%rsi), %xmm1
+ movaps 32(%rsi), %xmm2
+ movaps 48(%rsi), %xmm3
+ movaps 64(%rsi), %xmm4
+ movaps 80(%rsi), %xmm5
+ movntps %xmm1, 16(%rdi)
+ movntps %xmm2, 32(%rdi)
+ movntps %xmm3, 48(%rdi)
+ movntps %xmm4, 64(%rdi)
+ movntps %xmm5, 80(%rdi)
+ addq $80, %rdi
+ addq $80, %rsi
+ cmpq %rdi, %rcx
+ ja L(large_loop_fwd_0x0)
+
+ /* Ensure no icache line split on tail. */
+ .p2align 4
+L(end_large_loop_fwd):
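+	/* The movntps stores above are weakly ordered; sfence makes
+	   them globally visible before the ordinary tail stores
+	   below.  */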
+ sfence
+ movups %xmm11, 16(%rcx)
+ movups %xmm10, 32(%rcx)
+ movups %xmm9, 48(%rcx)
+ movups %xmm8, 64(%rcx)
+ movups %xmm7, 80(%rcx)
+ ret
+
+
+	/* Each loop body is more than 64 bytes and at most 96 bytes;
+	   together with the 32-byte alignment this gives exactly
+	   96-byte spacing between entries.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by); \
+ .p2align 5; \
+L(large_loop_fwd_ ## align_by): \
+ movaps 16(%rsi), %xmm0; \
+ movaps 32(%rsi), %xmm2; \
+ movaps 48(%rsi), %xmm3; \
+ movaps 64(%rsi), %xmm4; \
+ movaps 80(%rsi), %xmm5; \
+ movaps %xmm5, %xmm6; \
+ palignr $align_by, %xmm4, %xmm5; \
+ palignr $align_by, %xmm3, %xmm4; \
+ palignr $align_by, %xmm2, %xmm3; \
+ palignr $align_by, %xmm0, %xmm2; \
+ palignr $align_by, %xmm1, %xmm0; \
+ movaps %xmm6, %xmm1; \
+ movntps %xmm0, 16(%rdi); \
+ movntps %xmm2, 32(%rdi); \
+ movntps %xmm3, 48(%rdi); \
+ movntps %xmm4, 64(%rdi); \
+ movntps %xmm5, 80(%rdi); \
+ addq $80, %rdi; \
+ addq $80, %rsi; \
+ cmpq %rdi, %rcx; \
+ ja L(large_loop_fwd_ ## align_by); \
+ jmp L(end_large_loop_fwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LARGE_LOOP_FWD (0xf)
+ ALIGNED_LARGE_LOOP_FWD (0xe)
+ ALIGNED_LARGE_LOOP_FWD (0xd)
+ ALIGNED_LARGE_LOOP_FWD (0xc)
+ ALIGNED_LARGE_LOOP_FWD (0xb)
+ ALIGNED_LARGE_LOOP_FWD (0xa)
+ ALIGNED_LARGE_LOOP_FWD (0x9)
+ ALIGNED_LARGE_LOOP_FWD (0x8)
+ ALIGNED_LARGE_LOOP_FWD (0x7)
+ ALIGNED_LARGE_LOOP_FWD (0x6)
+ ALIGNED_LARGE_LOOP_FWD (0x5)
+ ALIGNED_LARGE_LOOP_FWD (0x4)
+ ALIGNED_LARGE_LOOP_FWD (0x3)
+ ALIGNED_LARGE_LOOP_FWD (0x2)
+ ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+ .p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+ movaps 32(%rsi), %xmm1
+ movaps 16(%rsi), %xmm2
+ movaps 0(%rsi), %xmm3
+ movaps %xmm1, 32(%rdi)
+ movaps %xmm2, 16(%rdi)
+ movaps %xmm3, 0(%rdi)
+ subq $48, %rdi
+ subq $48, %rsi
+ cmpq %rdi, %r8
+ jb L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+ movups %xmm7, -16(%r8, %rdx)
+ movups %xmm0, 0(%r8)
+ movups %xmm4, 16(%r8)
+ movups %xmm5, 32(%r8)
+
+ ret
+
+
+	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` gets the long
+	   encoding.  60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by); \
+ .p2align 6; \
+L(loop_bkwd_ ## align_by): \
+ movaps 32(%rsi), %xmm1; \
+ movaps 16(%rsi), %xmm2; \
+ movaps 0(%rsi), %xmm3; \
+ palignr $align_by, %xmm1, %xmm6; \
+ palignr $align_by, %xmm2, %xmm1; \
+ palignr $align_by, %xmm3, %xmm2; \
+ movaps %xmm6, 32(%rdi); \
+ movaps %xmm1, 16(%rdi); \
+ movaps %xmm2, 0(%rdi); \
+ subq $48, %rdi; \
+ subq $48, %rsi; \
+ movaps %xmm3, %xmm6; \
+ cmpq %rdi, %r8; \
+ jb L(loop_bkwd_ ## align_by); \
+ jmp L(end_loop_bkwd);
+
+ /* Must be in descending order. */
+ ALIGNED_LOOP_BKWD (0xf)
+ ALIGNED_LOOP_BKWD (0xe)
+ ALIGNED_LOOP_BKWD (0xd)
+ ALIGNED_LOOP_BKWD (0xc)
+ ALIGNED_LOOP_BKWD (0xb)
+ ALIGNED_LOOP_BKWD (0xa)
+ ALIGNED_LOOP_BKWD (0x9)
+ ALIGNED_LOOP_BKWD (0x8)
+ ALIGNED_LOOP_BKWD (0x7)
+ ALIGNED_LOOP_BKWD (0x6)
+ ALIGNED_LOOP_BKWD (0x5)
+ ALIGNED_LOOP_BKWD (0x4)
+ ALIGNED_LOOP_BKWD (0x3)
+ ALIGNED_LOOP_BKWD (0x2)
+ ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
--
2.25.1
* Re: [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
` (4 preceding siblings ...)
2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
@ 2022-04-14 18:04 ` H.J. Lu
5 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-04-14 18:04 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
> sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
> sysdeps/x86_64/multiarch/memcmp-ssse3.S | 1992 --------------------
> sysdeps/x86_64/multiarch/wmemcmp-ssse3.S | 4 -
> 5 files changed, 2006 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 6507d1b7fa..51222dfab1 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -12,7 +12,6 @@ sysdep_routines += \
> memcmp-evex-movbe \
> memcmp-sse2 \
> memcmp-sse4 \
> - memcmp-ssse3 \
> memcmpeq-avx2 \
> memcmpeq-avx2-rtm \
> memcmpeq-evex \
> @@ -179,7 +178,6 @@ sysdep_routines += \
> wmemcmp-c \
> wmemcmp-evex-movbe \
> wmemcmp-sse4 \
> - wmemcmp-ssse3 \
> # sysdep_routines
> endif
>
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 40cc6cc49e..f389928a4e 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __memcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
> __memcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
> - __memcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
>
> #ifdef SHARED
> @@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __wmemcmp_evex_movbe)
> IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
> __wmemcmp_sse4_1)
> - IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
> - __wmemcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/wmemset.c. */
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> index cd12613699..44759a3ad5 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
> @@ -20,7 +20,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
> @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
> return OPTIMIZE (sse4_1);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
> diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> deleted file mode 100644
> index df1b1fc494..0000000000
> --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
> +++ /dev/null
> @@ -1,1992 +0,0 @@
> -/* memcmp with SSSE3, wmemcmp with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef MEMCMP
> -# define MEMCMP __memcmp_ssse3
> -# endif
> -
> -/* Warning!
> - wmemcmp has to use SIGNED comparison for elements.
> - memcmp has to use UNSIGNED comparison for elemnts.
> -*/
> -
> - atom_text_section
> -ENTRY (MEMCMP)
> -# ifdef USE_AS_WMEMCMP
> - shl $2, %RDX_LP
> - test %RDX_LP, %RDX_LP
> - jz L(equal)
> -# elif defined __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -# endif
> - mov %rdx, %rcx
> - mov %rdi, %rdx
> - cmp $48, %rcx;
> - jae L(48bytesormore) /* LEN => 48 */
> -
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -/* ECX >= 32. */
> -L(48bytesormore):
> - movdqu (%rdi), %xmm3
> - movdqu (%rsi), %xmm0
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 16(%rdi), %rdi
> - lea 16(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(less16bytes)
> - mov %edi, %edx
> - and $0xf, %edx
> - xor %rdx, %rdi
> - sub %rdx, %rsi
> - add %rdx, %rcx
> - mov %esi, %edx
> - and $0xf, %edx
> - jz L(shr_0)
> - xor %rdx, %rsi
> -
> -# ifndef USE_AS_WMEMCMP
> - cmp $8, %edx
> - jae L(next_unaligned_table)
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $1, %edx
> - je L(shr_1)
> - cmp $2, %edx
> - je L(shr_2)
> - cmp $3, %edx
> - je L(shr_3)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $5, %edx
> - je L(shr_5)
> - cmp $6, %edx
> - je L(shr_6)
> - jmp L(shr_7)
> -
> - .p2align 2
> -L(next_unaligned_table):
> - cmp $8, %edx
> - je L(shr_8)
> - cmp $9, %edx
> - je L(shr_9)
> - cmp $10, %edx
> - je L(shr_10)
> - cmp $11, %edx
> - je L(shr_11)
> - cmp $12, %edx
> - je L(shr_12)
> - cmp $13, %edx
> - je L(shr_13)
> - cmp $14, %edx
> - je L(shr_14)
> - jmp L(shr_15)
> -# else
> - cmp $0, %edx
> - je L(shr_0)
> - cmp $4, %edx
> - je L(shr_4)
> - cmp $8, %edx
> - je L(shr_8)
> - jmp L(shr_12)
> -# endif
> -
> - .p2align 4
> -L(shr_0):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - jae L(shr_0_gobble)
> - xor %eax, %eax
> - movdqa (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> - pand %xmm1, %xmm2
> - pmovmskb %xmm2, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_0_gobble):
> - movdqa (%rsi), %xmm0
> - xor %eax, %eax
> - pcmpeqb (%rdi), %xmm0
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm2
> - pcmpeqb 16(%rdi), %xmm2
> -L(shr_0_gobble_loop):
> - pand %xmm0, %xmm2
> - sub $32, %rcx
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - movdqa 32(%rsi), %xmm0
> - movdqa 48(%rsi), %xmm2
> - sbb $0xffff, %edx
> - pcmpeqb 32(%rdi), %xmm0
> - pcmpeqb 48(%rdi), %xmm2
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - jz L(shr_0_gobble_loop)
> -
> - pand %xmm0, %xmm2
> - cmp $0, %rcx
> - jge L(next)
> - inc %edx
> - add $32, %rcx
> -L(next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm2, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_1):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_1_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $1, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $1, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_1_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $1, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $1, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_1_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $1, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $1, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_1_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_1_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_1_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 1(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -
> - .p2align 4
> -L(shr_2):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_2_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $2, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $2, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_2_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $2, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $2, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_2_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $2, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $2, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_2_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_2_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_2_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 2(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_3_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $3, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $3, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_3_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $3, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $3, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_3_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $3, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $3, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_3_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_3_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_3_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 3(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_4):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_4_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $4, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $4, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_4_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $4, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $4, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_4_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $4, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $4, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_4_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_4_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_4_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 4(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_5):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_5_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $5, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $5, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_5_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $5, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $5, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_5_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $5, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $5, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_5_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_5_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_5_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 5(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_6_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $6, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $6, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_6_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $6, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $6, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_6_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $6, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $6, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_6_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_6_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_6_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 6(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_7_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $7, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $7, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_7_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $7, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $7, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_7_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $7, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $7, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_7_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_7_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_7_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 7(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_8):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_8_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $8, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $8, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_8_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $8, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $8, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_8_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $8, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $8, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_8_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_8_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_8_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 8(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_9):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_9_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $9, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $9, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_9_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $9, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $9, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_9_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $9, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $9, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_9_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_9_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_9_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 9(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_10_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $10, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $10, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_10_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $10, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $10, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_10_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $10, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $10, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_10_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_10_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_10_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 10(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_11_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $11, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $11, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_11_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $11, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $11, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_11_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $11, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $11, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_11_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_11_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_11_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 11(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# endif
> -
> - .p2align 4
> -L(shr_12):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_12_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $12, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $12, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_12_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $12, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $12, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_12_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $12, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $12, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_12_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_12_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_12_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 12(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> -# ifndef USE_AS_WMEMCMP
> -
> - .p2align 4
> -L(shr_13):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_13_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $13, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $13, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_13_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $13, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $13, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_13_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $13, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $13, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_13_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_13_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_13_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 13(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_14_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $14, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $14, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_14_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $14, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $14, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_14_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $14, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $14, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_14_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_14_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_14_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 14(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15):
> - cmp $80, %rcx
> - lea -48(%rcx), %rcx
> - mov %edx, %eax
> - jae L(shr_15_gobble)
> -
> - movdqa 16(%rsi), %xmm1
> - movdqa %xmm1, %xmm2
> - palignr $15, (%rsi), %xmm1
> - pcmpeqb (%rdi), %xmm1
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, %xmm2, %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> - pand %xmm1, %xmm3
> - pmovmskb %xmm3, %edx
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> - add $15, %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -
> - .p2align 4
> -L(shr_15_gobble):
> - sub $32, %rcx
> - movdqa 16(%rsi), %xmm0
> - palignr $15, (%rsi), %xmm0
> - pcmpeqb (%rdi), %xmm0
> -
> - movdqa 32(%rsi), %xmm3
> - palignr $15, 16(%rsi), %xmm3
> - pcmpeqb 16(%rdi), %xmm3
> -
> -L(shr_15_gobble_loop):
> - pand %xmm0, %xmm3
> - sub $32, %rcx
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> -
> - movdqa 64(%rsi), %xmm3
> - palignr $15, 48(%rsi), %xmm3
> - sbb $0xffff, %edx
> - movdqa 48(%rsi), %xmm0
> - palignr $15, 32(%rsi), %xmm0
> - pcmpeqb 32(%rdi), %xmm0
> - lea 32(%rsi), %rsi
> - pcmpeqb 48(%rdi), %xmm3
> -
> - lea 32(%rdi), %rdi
> - jz L(shr_15_gobble_loop)
> - pand %xmm0, %xmm3
> -
> - cmp $0, %rcx
> - jge L(shr_15_gobble_next)
> - inc %edx
> - add $32, %rcx
> -L(shr_15_gobble_next):
> - test %edx, %edx
> - jnz L(exit)
> -
> - pmovmskb %xmm3, %edx
> - movdqa %xmm0, %xmm1
> - lea 32(%rdi), %rdi
> - lea 32(%rsi), %rsi
> - sub $0xffff, %edx
> - jnz L(exit)
> -
> - lea 15(%rsi), %rsi
> - add %rcx, %rsi
> - add %rcx, %rdi
> - jmp L(less48bytes)
> -# endif
> - .p2align 4
> -L(exit):
> - pmovmskb %xmm1, %r8d
> - sub $0xffff, %r8d
> - jz L(first16bytes)
> - lea -16(%rsi), %rsi
> - lea -16(%rdi), %rdi
> - mov %r8d, %edx
> -L(first16bytes):
> - add %rax, %rsi
> -L(less16bytes):
> -# ifndef USE_AS_WMEMCMP
> - test %dl, %dl
> - jz L(next_24_bytes)
> -
> - test $0x01, %dl
> - jnz L(Byte16)
> -
> - test $0x02, %dl
> - jnz L(Byte17)
> -
> - test $0x04, %dl
> - jnz L(Byte18)
> -
> - test $0x08, %dl
> - jnz L(Byte19)
> -
> - test $0x10, %dl
> - jnz L(Byte20)
> -
> - test $0x20, %dl
> - jnz L(Byte21)
> -
> - test $0x40, %dl
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte16):
> - movzbl -16(%rdi), %eax
> - movzbl -16(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte17):
> - movzbl -15(%rdi), %eax
> - movzbl -15(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte18):
> - movzbl -14(%rdi), %eax
> - movzbl -14(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte19):
> - movzbl -13(%rdi), %eax
> - movzbl -13(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte20):
> - movzbl -12(%rdi), %eax
> - movzbl -12(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte21):
> - movzbl -11(%rdi), %eax
> - movzbl -11(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(Byte22):
> - movzbl -10(%rdi), %eax
> - movzbl -10(%rsi), %edx
> - sub %edx, %eax
> - ret
> -
> - .p2align 4
> -L(next_24_bytes):
> - lea 8(%rdi), %rdi
> - lea 8(%rsi), %rsi
> - test $0x01, %dh
> - jnz L(Byte16)
> -
> - test $0x02, %dh
> - jnz L(Byte17)
> -
> - test $0x04, %dh
> - jnz L(Byte18)
> -
> - test $0x08, %dh
> - jnz L(Byte19)
> -
> - test $0x10, %dh
> - jnz L(Byte20)
> -
> - test $0x20, %dh
> - jnz L(Byte21)
> -
> - test $0x40, %dh
> - jnz L(Byte22)
> -
> - movzbl -9(%rdi), %eax
> - movzbl -9(%rsi), %edx
> - sub %edx, %eax
> - ret
> -# else
> -/* special for wmemcmp */
> - xor %eax, %eax
> - test %dl, %dl
> - jz L(next_two_double_words)
> - and $15, %dl
> - jz L(second_double_word)
> - mov -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(second_double_word):
> - mov -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(next_two_double_words):
> - and $15, %dh
> - jz L(fourth_double_word)
> - mov -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> - ret
> -
> - .p2align 4
> -L(fourth_double_word):
> - mov -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> - ret
> -# endif
> -
> - .p2align 4
> -L(less48bytes):
> - cmp $8, %ecx
> - jae L(more8bytes)
> - cmp $0, %ecx
> - je L(0bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $1, %ecx
> - je L(1bytes)
> - cmp $2, %ecx
> - je L(2bytes)
> - cmp $3, %ecx
> - je L(3bytes)
> - cmp $4, %ecx
> - je L(4bytes)
> - cmp $5, %ecx
> - je L(5bytes)
> - cmp $6, %ecx
> - je L(6bytes)
> - jmp L(7bytes)
> -# else
> - jmp L(4bytes)
> -# endif
> -
> - .p2align 4
> -L(more8bytes):
> - cmp $16, %ecx
> - jae L(more16bytes)
> - cmp $8, %ecx
> - je L(8bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $9, %ecx
> - je L(9bytes)
> - cmp $10, %ecx
> - je L(10bytes)
> - cmp $11, %ecx
> - je L(11bytes)
> - cmp $12, %ecx
> - je L(12bytes)
> - cmp $13, %ecx
> - je L(13bytes)
> - cmp $14, %ecx
> - je L(14bytes)
> - jmp L(15bytes)
> -# else
> - jmp L(12bytes)
> -# endif
> -
> - .p2align 4
> -L(more16bytes):
> - cmp $24, %ecx
> - jae L(more24bytes)
> - cmp $16, %ecx
> - je L(16bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $17, %ecx
> - je L(17bytes)
> - cmp $18, %ecx
> - je L(18bytes)
> - cmp $19, %ecx
> - je L(19bytes)
> - cmp $20, %ecx
> - je L(20bytes)
> - cmp $21, %ecx
> - je L(21bytes)
> - cmp $22, %ecx
> - je L(22bytes)
> - jmp L(23bytes)
> -# else
> - jmp L(20bytes)
> -# endif
> -
> - .p2align 4
> -L(more24bytes):
> - cmp $32, %ecx
> - jae L(more32bytes)
> - cmp $24, %ecx
> - je L(24bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $25, %ecx
> - je L(25bytes)
> - cmp $26, %ecx
> - je L(26bytes)
> - cmp $27, %ecx
> - je L(27bytes)
> - cmp $28, %ecx
> - je L(28bytes)
> - cmp $29, %ecx
> - je L(29bytes)
> - cmp $30, %ecx
> - je L(30bytes)
> - jmp L(31bytes)
> -# else
> - jmp L(28bytes)
> -# endif
> -
> - .p2align 4
> -L(more32bytes):
> - cmp $40, %ecx
> - jae L(more40bytes)
> - cmp $32, %ecx
> - je L(32bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $33, %ecx
> - je L(33bytes)
> - cmp $34, %ecx
> - je L(34bytes)
> - cmp $35, %ecx
> - je L(35bytes)
> - cmp $36, %ecx
> - je L(36bytes)
> - cmp $37, %ecx
> - je L(37bytes)
> - cmp $38, %ecx
> - je L(38bytes)
> - jmp L(39bytes)
> -# else
> - jmp L(36bytes)
> -# endif
> -
> - .p2align 4
> -L(more40bytes):
> - cmp $40, %ecx
> - je L(40bytes)
> -# ifndef USE_AS_WMEMCMP
> - cmp $41, %ecx
> - je L(41bytes)
> - cmp $42, %ecx
> - je L(42bytes)
> - cmp $43, %ecx
> - je L(43bytes)
> - cmp $44, %ecx
> - je L(44bytes)
> - cmp $45, %ecx
> - je L(45bytes)
> - cmp $46, %ecx
> - je L(46bytes)
> - jmp L(47bytes)
> -
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - movl -44(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - movl -40(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - movl -36(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - movl -32(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - movl -28(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - movl -24(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - movl -20(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - movl -16(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - movl -12(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - movl -8(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - movl -4(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# else
> - .p2align 4
> -L(44bytes):
> - movl -44(%rdi), %eax
> - cmp -44(%rsi), %eax
> - jne L(find_diff)
> -L(40bytes):
> - movl -40(%rdi), %eax
> - cmp -40(%rsi), %eax
> - jne L(find_diff)
> -L(36bytes):
> - movl -36(%rdi), %eax
> - cmp -36(%rsi), %eax
> - jne L(find_diff)
> -L(32bytes):
> - movl -32(%rdi), %eax
> - cmp -32(%rsi), %eax
> - jne L(find_diff)
> -L(28bytes):
> - movl -28(%rdi), %eax
> - cmp -28(%rsi), %eax
> - jne L(find_diff)
> -L(24bytes):
> - movl -24(%rdi), %eax
> - cmp -24(%rsi), %eax
> - jne L(find_diff)
> -L(20bytes):
> - movl -20(%rdi), %eax
> - cmp -20(%rsi), %eax
> - jne L(find_diff)
> -L(16bytes):
> - movl -16(%rdi), %eax
> - cmp -16(%rsi), %eax
> - jne L(find_diff)
> -L(12bytes):
> - movl -12(%rdi), %eax
> - cmp -12(%rsi), %eax
> - jne L(find_diff)
> -L(8bytes):
> - movl -8(%rdi), %eax
> - cmp -8(%rsi), %eax
> - jne L(find_diff)
> -L(4bytes):
> - movl -4(%rdi), %eax
> - cmp -4(%rsi), %eax
> - jne L(find_diff)
> -L(0bytes):
> - xor %eax, %eax
> - ret
> -# endif
> -
> -# ifndef USE_AS_WMEMCMP
> - .p2align 4
> -L(45bytes):
> - movl -45(%rdi), %eax
> - movl -45(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(41bytes):
> - movl -41(%rdi), %eax
> - movl -41(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(37bytes):
> - movl -37(%rdi), %eax
> - movl -37(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(33bytes):
> - movl -33(%rdi), %eax
> - movl -33(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(29bytes):
> - movl -29(%rdi), %eax
> - movl -29(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(25bytes):
> - movl -25(%rdi), %eax
> - movl -25(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(21bytes):
> - movl -21(%rdi), %eax
> - movl -21(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(17bytes):
> - movl -17(%rdi), %eax
> - movl -17(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(13bytes):
> - movl -13(%rdi), %eax
> - movl -13(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(9bytes):
> - movl -9(%rdi), %eax
> - movl -9(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(5bytes):
> - movl -5(%rdi), %eax
> - movl -5(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(1bytes):
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(46bytes):
> - movl -46(%rdi), %eax
> - movl -46(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(42bytes):
> - movl -42(%rdi), %eax
> - movl -42(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(38bytes):
> - movl -38(%rdi), %eax
> - movl -38(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(34bytes):
> - movl -34(%rdi), %eax
> - movl -34(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(30bytes):
> - movl -30(%rdi), %eax
> - movl -30(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(26bytes):
> - movl -26(%rdi), %eax
> - movl -26(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(22bytes):
> - movl -22(%rdi), %eax
> - movl -22(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(18bytes):
> - movl -18(%rdi), %eax
> - movl -18(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(14bytes):
> - movl -14(%rdi), %eax
> - movl -14(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(10bytes):
> - movl -10(%rdi), %eax
> - movl -10(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(6bytes):
> - movl -6(%rdi), %eax
> - movl -6(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(2bytes):
> - movzwl -2(%rdi), %eax
> - movzwl -2(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(47bytes):
> - movl -47(%rdi), %eax
> - movl -47(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(43bytes):
> - movl -43(%rdi), %eax
> - movl -43(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(39bytes):
> - movl -39(%rdi), %eax
> - movl -39(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(35bytes):
> - movl -35(%rdi), %eax
> - movl -35(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(31bytes):
> - movl -31(%rdi), %eax
> - movl -31(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(27bytes):
> - movl -27(%rdi), %eax
> - movl -27(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(23bytes):
> - movl -23(%rdi), %eax
> - movl -23(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(19bytes):
> - movl -19(%rdi), %eax
> - movl -19(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(15bytes):
> - movl -15(%rdi), %eax
> - movl -15(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(11bytes):
> - movl -11(%rdi), %eax
> - movl -11(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(7bytes):
> - movl -7(%rdi), %eax
> - movl -7(%rsi), %ecx
> - cmp %ecx, %eax
> - jne L(find_diff)
> -L(3bytes):
> - movzwl -3(%rdi), %eax
> - movzwl -3(%rsi), %ecx
> - cmpb %cl, %al
> - jne L(set)
> - cmp %ecx, %eax
> - jne L(set)
> - movzbl -1(%rdi), %eax
> - cmpb -1(%rsi), %al
> - jne L(set)
> - xor %eax, %eax
> - ret
> -
> - .p2align 4
> -L(find_diff):
> - cmpb %cl, %al
> - jne L(set)
> - cmpw %cx, %ax
> - jne L(set)
> - shr $16, %eax
> - shr $16, %ecx
> - cmpb %cl, %al
> - jne L(set)
> -
> -/* We get there only if we already know there is a
> -difference. */
> -
> - cmp %ecx, %eax
> -L(set):
> - sbb %eax, %eax
> - sbb $-1, %eax
> - ret
> -# else
> -
> -/* for wmemcmp */
> - .p2align 4
> -L(find_diff):
> - mov $1, %eax
> - jg L(find_diff_bigger)
> - neg %eax
> - ret
> -
> - .p2align 4
> -L(find_diff_bigger):
> - ret
> -# endif
> -
> - .p2align 4
> -L(equal):
> - xor %eax, %eax
> - ret
> -
> -END (MEMCMP)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> deleted file mode 100644
> index a41ef95fc1..0000000000
> --- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_WMEMCMP 1
> -#define MEMCMP __wmemcmp_ssse3
> -
> -#include "memcmp-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
* Re: [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
@ 2022-04-14 18:05 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-04-14 18:05 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 --
> sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 -
> sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S | 6 -
> sysdeps/x86_64/multiarch/strcmp-ssse3.S | 5 -
> sysdeps/x86_64/multiarch/strcmp.c | 4 -
> sysdeps/x86_64/multiarch/strncase_l-ssse3.S | 6 -
> sysdeps/x86_64/multiarch/strncmp-ssse3.S | 28 ----
> sysdeps/x86_64/multiarch/strncmp.c | 4 -
> sysdeps/x86_64/strcmp.S | 155 ++++--------------
> 10 files changed, 30 insertions(+), 202 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcmp-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncmp-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 51222dfab1..ed2def288d 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -58,7 +58,6 @@ sysdep_routines += \
> strcasecmp_l-evex \
> strcasecmp_l-sse2 \
> strcasecmp_l-sse4_2 \
> - strcasecmp_l-ssse3 \
> strcat-avx2 \
> strcat-avx2-rtm \
> strcat-evex \
> @@ -80,7 +79,6 @@ sysdep_routines += \
> strcmp-sse2 \
> strcmp-sse2-unaligned \
> strcmp-sse4_2 \
> - strcmp-ssse3 \
> strcpy-avx2 \
> strcpy-avx2-rtm \
> strcpy-evex \
> @@ -98,7 +96,6 @@ sysdep_routines += \
> strncase_l-evex \
> strncase_l-sse2 \
> strncase_l-sse4_2 \
> - strncase_l-ssse3 \
> strncat-avx2 \
> strncat-avx2-rtm \
> strncat-c \
> @@ -110,7 +107,6 @@ sysdep_routines += \
> strncmp-evex \
> strncmp-sse2 \
> strncmp-sse4_2 \
> - strncmp-ssse3 \
> strncpy-avx2 \
> strncpy-avx2-rtm \
> strncpy-c \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index f389928a4e..7e2be3554b 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -448,9 +448,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strcasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strcasecmp,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strcasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_sse2))
>
> /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */
> @@ -469,9 +466,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strcasecmp_l_sse42)
> - IFUNC_IMPL_ADD (array, i, strcasecmp_l,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strcasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
> __strcasecmp_l_sse2))
>
> @@ -560,8 +554,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strcmp_evex)
> IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
> __strcmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
> - __strcmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcmp, 1, __strcmp_sse2))
>
> @@ -604,9 +596,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strncasecmp,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strncasecmp,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strncasecmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp, 1,
> __strncasecmp_sse2))
>
> @@ -626,9 +615,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> CPU_FEATURE_USABLE (SSE4_2),
> __strncasecmp_l_sse42)
> - IFUNC_IMPL_ADD (array, i, strncasecmp_l,
> - CPU_FEATURE_USABLE (SSSE3),
> - __strncasecmp_l_ssse3)
> IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
> __strncasecmp_l_sse2))
>
> @@ -1054,8 +1040,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> __strncmp_evex)
> IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
> __strncmp_sse42)
> - IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
> - __strncmp_ssse3)
> IFUNC_IMPL_ADD (array, i, strncmp, 1, __strncmp_sse2))
>
> #ifdef SHARED
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> index 766539c241..296d32071b 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h
> @@ -20,7 +20,6 @@
> #include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> @@ -49,8 +48,5 @@ IFUNC_SELECTOR (void)
> && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> return OPTIMIZE (sse42);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
> diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S b/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> deleted file mode 100644
> index fb2f9ae14a..0000000000
> --- a/sysdeps/x86_64/multiarch/strcasecmp_l-ssse3.S
> +++ /dev/null
> @@ -1,6 +0,0 @@
> -#define USE_SSSE3 1
> -#define USE_AS_STRCASECMP_L
> -#define NO_NOLOCALE_ALIAS
> -#define STRCMP __strcasecmp_l_ssse3
> -#define __strcasecmp __strcasecmp_ssse3
> -#include "../strcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/strcmp-ssse3.S b/sysdeps/x86_64/multiarch/strcmp-ssse3.S
> deleted file mode 100644
> index 1b7fa33c91..0000000000
> --- a/sysdeps/x86_64/multiarch/strcmp-ssse3.S
> +++ /dev/null
> @@ -1,5 +0,0 @@
> -#if IS_IN (libc)
> -# define USE_SSSE3 1
> -# define STRCMP __strcmp_ssse3
> -# include "../strcmp.S"
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
> index 68cb73baad..a248c2a6e6 100644
> --- a/sysdeps/x86_64/multiarch/strcmp.c
> +++ b/sysdeps/x86_64/multiarch/strcmp.c
> @@ -28,7 +28,6 @@
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -56,9 +55,6 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> return OPTIMIZE (sse2_unaligned);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
>
> diff --git a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S b/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> deleted file mode 100644
> index 6728678688..0000000000
> --- a/sysdeps/x86_64/multiarch/strncase_l-ssse3.S
> +++ /dev/null
> @@ -1,6 +0,0 @@
> -#define USE_SSSE3 1
> -#define USE_AS_STRNCASECMP_L
> -#define NO_NOLOCALE_ALIAS
> -#define STRCMP __strncasecmp_l_ssse3
> -#define __strncasecmp __strncasecmp_ssse3
> -#include "../strcmp.S"
> diff --git a/sysdeps/x86_64/multiarch/strncmp-ssse3.S b/sysdeps/x86_64/multiarch/strncmp-ssse3.S
> deleted file mode 100644
> index ec37308347..0000000000
> --- a/sysdeps/x86_64/multiarch/strncmp-ssse3.S
> +++ /dev/null
> @@ -1,28 +0,0 @@
> -/* strcmp optimized with SSSE3.
> - Copyright (C) 2017-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#define STRCMP __strncmp_ssse3
> -
> -#undef libc_hidden_builtin_def
> -#define libc_hidden_builtin_def(strcmp)
> -
> -#define USE_SSSE3 1
> -#define USE_AS_STRNCMP
> -#include <sysdeps/x86_64/strcmp.S>
> diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
> index fca74199d8..70ae6547c9 100644
> --- a/sysdeps/x86_64/multiarch/strncmp.c
> +++ b/sysdeps/x86_64/multiarch/strncmp.c
> @@ -27,7 +27,6 @@
> # include <init-arch.h>
>
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> @@ -57,9 +56,6 @@ IFUNC_SELECTOR (void)
> && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
> return OPTIMIZE (sse42);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
>
> diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
> index 99d8b36f1d..c38dc627f9 100644
> --- a/sysdeps/x86_64/strcmp.S
> +++ b/sysdeps/x86_64/strcmp.S
> @@ -59,12 +59,7 @@
> # endif
> #endif
>
> -#ifndef USE_SSSE3
> .text
> -#else
> - .section .text.ssse3,"ax",@progbits
> -#endif
> -
> #ifdef USE_AS_STRCASECMP_L
> # ifndef ENTRY2
> # define ENTRY2(name) ENTRY (name)
> @@ -343,13 +338,10 @@ LABEL(gobble_ashr_1):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4 /* store for next cycle */
>
> -#ifndef USE_SSSE3
> psrldq $1, %xmm3
> pslldq $15, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -373,13 +365,10 @@ LABEL(gobble_ashr_1):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4 /* store for next cycle */
>
> -#ifndef USE_SSSE3
> psrldq $1, %xmm3
> pslldq $15, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $1, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -473,13 +462,10 @@ LABEL(gobble_ashr_2):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $2, %xmm3
> pslldq $14, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -504,13 +490,10 @@ LABEL(gobble_ashr_2):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $2, %xmm3
> pslldq $14, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $2, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -598,13 +581,10 @@ LABEL(gobble_ashr_3):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $3, %xmm3
> pslldq $13, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -629,13 +609,10 @@ LABEL(gobble_ashr_3):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $3, %xmm3
> pslldq $13, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $3, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -723,13 +700,10 @@ LABEL(gobble_ashr_4):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $4, %xmm3
> pslldq $12, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -754,13 +728,10 @@ LABEL(gobble_ashr_4):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $4, %xmm3
> pslldq $12, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $4, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -848,13 +819,10 @@ LABEL(gobble_ashr_5):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $5, %xmm3
> pslldq $11, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -879,13 +847,10 @@ LABEL(gobble_ashr_5):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $5, %xmm3
> pslldq $11, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $5, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -973,13 +938,10 @@ LABEL(gobble_ashr_6):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $6, %xmm3
> pslldq $10, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1004,13 +966,10 @@ LABEL(gobble_ashr_6):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $6, %xmm3
> pslldq $10, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $6, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1098,13 +1057,10 @@ LABEL(gobble_ashr_7):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $7, %xmm3
> pslldq $9, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1129,13 +1085,10 @@ LABEL(gobble_ashr_7):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $7, %xmm3
> pslldq $9, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $7, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1223,13 +1176,10 @@ LABEL(gobble_ashr_8):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $8, %xmm3
> pslldq $8, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1254,13 +1204,10 @@ LABEL(gobble_ashr_8):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $8, %xmm3
> pslldq $8, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $8, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1348,13 +1295,10 @@ LABEL(gobble_ashr_9):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $9, %xmm3
> pslldq $7, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1379,13 +1323,10 @@ LABEL(gobble_ashr_9):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $9, %xmm3
> pslldq $7, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $9, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1473,13 +1414,10 @@ LABEL(gobble_ashr_10):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $10, %xmm3
> pslldq $6, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1504,13 +1442,10 @@ LABEL(gobble_ashr_10):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $10, %xmm3
> pslldq $6, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $10, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1598,13 +1533,10 @@ LABEL(gobble_ashr_11):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $11, %xmm3
> pslldq $5, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1629,13 +1561,10 @@ LABEL(gobble_ashr_11):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $11, %xmm3
> pslldq $5, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $11, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1723,13 +1652,10 @@ LABEL(gobble_ashr_12):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $12, %xmm3
> pslldq $4, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1754,13 +1680,10 @@ LABEL(gobble_ashr_12):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $12, %xmm3
> pslldq $4, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $12, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1848,13 +1771,10 @@ LABEL(gobble_ashr_13):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $13, %xmm3
> pslldq $3, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1879,13 +1799,10 @@ LABEL(gobble_ashr_13):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $13, %xmm3
> pslldq $3, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $13, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -1973,13 +1890,10 @@ LABEL(gobble_ashr_14):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $14, %xmm3
> pslldq $2, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2004,13 +1918,10 @@ LABEL(gobble_ashr_14):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $14, %xmm3
> pslldq $2, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $14, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2100,13 +2011,10 @@ LABEL(gobble_ashr_15):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $15, %xmm3
> pslldq $1, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> @@ -2131,13 +2039,10 @@ LABEL(gobble_ashr_15):
> movdqa (%rdi, %rcx), %xmm2
> movdqa %xmm2, %xmm4
>
> -#ifndef USE_SSSE3
> psrldq $15, %xmm3
> pslldq $1, %xmm2
> por %xmm3, %xmm2 /* merge into one 16byte value */
> -#else
> - palignr $15, %xmm3, %xmm2 /* merge into one 16byte value */
> -#endif
> +
> TOLOWER (%xmm1, %xmm2)
>
> pcmpeqb %xmm1, %xmm0
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
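The strcmp.S hunks above drop the now-dead USE_SSSE3 conditional: with the SSSE3 variants gone, only the SSE2 merge sequence remains. For a shift of N bytes, the removed one-instruction form palignr $N, %xmm3, %xmm2 and the retained psrldq $N / pslldq $(16-N) / por triple compute the same value, namely bytes N..N+15 of the 32-byte concatenation xmm2:xmm3. A minimal sketch of that equivalence in C intrinsics (illustrative only; the function names here are ours, not glibc symbols):

    #include <emmintrin.h>   /* SSE2: psrldq, pslldq, por */
    #include <tmmintrin.h>   /* SSSE3: palignr */

    /* Illustrative only, not glibc code.  SSE2 expansion of
       "palignr $1, lo, hi": bytes 1..16 of the 32-byte value hi:lo,
       which is what the patched strcmp.S now does unconditionally.  */
    static __m128i
    merge_shift1_sse2 (__m128i hi, __m128i lo)
    {
      return _mm_or_si128 (_mm_srli_si128 (lo, 1),    /* psrldq $1  */
                           _mm_slli_si128 (hi, 15));  /* pslldq $15 */
    }

    /* The SSSE3 single-instruction form removed by the patch.  */
    static __m128i
    merge_shift1_ssse3 (__m128i hi, __m128i lo)
    {
      return _mm_alignr_epi8 (hi, lo, 1);             /* palignr $1 */
    }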
* Re: [PATCH v5 3/6] x86: Remove str{n}cat-ssse3
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
@ 2022-04-14 18:06 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-04-14 18:06 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
> sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 -
> sysdeps/x86_64/multiarch/strcat-ssse3.S | 866 ---------------------
> sysdeps/x86_64/multiarch/strncat-ssse3.S | 3 -
> 5 files changed, 879 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/strcat-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncat-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index ed2def288d..2b3c625ea2 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -63,7 +63,6 @@ sysdep_routines += \
> strcat-evex \
> strcat-sse2 \
> strcat-sse2-unaligned \
> - strcat-ssse3 \
> strchr-avx2 \
> strchr-avx2-rtm \
> strchr-evex \
> @@ -101,7 +100,6 @@ sysdep_routines += \
> strncat-c \
> strncat-evex \
> strncat-sse2-unaligned \
> - strncat-ssse3 \
> strncmp-avx2 \
> strncmp-avx2-rtm \
> strncmp-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 7e2be3554b..41a04621ad 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -481,8 +481,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcat_evex)
> - IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3),
> - __strcat_ssse3)
> IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2))
>
> @@ -630,8 +628,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncat_evex)
> - IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3),
> - __strncat_ssse3)
> IFUNC_IMPL_ADD (array, i, strncat, 1,
> __strncat_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncat, 1, __strncat_sse2))
> diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> index 5bece38f78..a15afa44e9 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h
> @@ -23,7 +23,6 @@
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
> @@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
> if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
> return OPTIMIZE (sse2_unaligned);
>
> - if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
> - return OPTIMIZE (ssse3);
> -
> return OPTIMIZE (sse2);
> }
> diff --git a/sysdeps/x86_64/multiarch/strcat-ssse3.S b/sysdeps/x86_64/multiarch/strcat-ssse3.S
> deleted file mode 100644
> index 9f39e4fcd1..0000000000
> --- a/sysdeps/x86_64/multiarch/strcat-ssse3.S
> +++ /dev/null
> @@ -1,866 +0,0 @@
> -/* strcat with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# include <sysdep.h>
> -
> -# ifndef STRCAT
> -# define STRCAT __strcat_ssse3
> -# endif
> -
> -# define USE_AS_STRCAT
> -
> -.text
> -ENTRY (STRCAT)
> -# ifdef USE_AS_STRNCAT
> - mov %rdx, %r8
> -# endif
> -
> -
> -/* Inline corresponding strlen file, temporary until new strcpy
> - implementation gets merged. */
> -
> - xor %eax, %eax
> - cmpb $0, (%rdi)
> - jz L(exit_tail0)
> - cmpb $0, 1(%rdi)
> - jz L(exit_tail1)
> - cmpb $0, 2(%rdi)
> - jz L(exit_tail2)
> - cmpb $0, 3(%rdi)
> - jz L(exit_tail3)
> -
> - cmpb $0, 4(%rdi)
> - jz L(exit_tail4)
> - cmpb $0, 5(%rdi)
> - jz L(exit_tail5)
> - cmpb $0, 6(%rdi)
> - jz L(exit_tail6)
> - cmpb $0, 7(%rdi)
> - jz L(exit_tail7)
> -
> - cmpb $0, 8(%rdi)
> - jz L(exit_tail8)
> - cmpb $0, 9(%rdi)
> - jz L(exit_tail9)
> - cmpb $0, 10(%rdi)
> - jz L(exit_tail10)
> - cmpb $0, 11(%rdi)
> - jz L(exit_tail11)
> -
> - cmpb $0, 12(%rdi)
> - jz L(exit_tail12)
> - cmpb $0, 13(%rdi)
> - jz L(exit_tail13)
> - cmpb $0, 14(%rdi)
> - jz L(exit_tail14)
> - cmpb $0, 15(%rdi)
> - jz L(exit_tail15)
> - pxor %xmm0, %xmm0
> - lea 16(%rdi), %rcx
> - lea 16(%rdi), %rax
> - and $-16, %rax
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - pxor %xmm1, %xmm1
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - pxor %xmm2, %xmm2
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - pxor %xmm3, %xmm3
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm0
> - pmovmskb %xmm0, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm1
> - pmovmskb %xmm1, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm2
> - pmovmskb %xmm2, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - pcmpeqb (%rax), %xmm3
> - pmovmskb %xmm3, %edx
> - test %edx, %edx
> - lea 16(%rax), %rax
> - jnz L(exit)
> -
> - and $-0x40, %rax
> -
> - .p2align 4
> -L(aligned_64):
> - pcmpeqb (%rax), %xmm0
> - pcmpeqb 16(%rax), %xmm1
> - pcmpeqb 32(%rax), %xmm2
> - pcmpeqb 48(%rax), %xmm3
> - pmovmskb %xmm0, %edx
> - pmovmskb %xmm1, %r11d
> - pmovmskb %xmm2, %r10d
> - pmovmskb %xmm3, %r9d
> - or %edx, %r9d
> - or %r11d, %r9d
> - or %r10d, %r9d
> - lea 64(%rax), %rax
> - jz L(aligned_64)
> -
> - test %edx, %edx
> - jnz L(aligned_64_exit_16)
> - test %r11d, %r11d
> - jnz L(aligned_64_exit_32)
> - test %r10d, %r10d
> - jnz L(aligned_64_exit_48)
> -
> -L(aligned_64_exit_64):
> - pmovmskb %xmm3, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_48):
> - lea -16(%rax), %rax
> - mov %r10d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_32):
> - lea -32(%rax), %rax
> - mov %r11d, %edx
> - jmp L(exit)
> -
> -L(aligned_64_exit_16):
> - lea -48(%rax), %rax
> -
> -L(exit):
> - sub %rcx, %rax
> - test %dl, %dl
> - jz L(exit_high)
> - test $0x01, %dl
> - jnz L(exit_tail0)
> -
> - test $0x02, %dl
> - jnz L(exit_tail1)
> -
> - test $0x04, %dl
> - jnz L(exit_tail2)
> -
> - test $0x08, %dl
> - jnz L(exit_tail3)
> -
> - test $0x10, %dl
> - jnz L(exit_tail4)
> -
> - test $0x20, %dl
> - jnz L(exit_tail5)
> -
> - test $0x40, %dl
> - jnz L(exit_tail6)
> - add $7, %eax
> -L(exit_tail0):
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_high):
> - add $8, %eax
> - test $0x01, %dh
> - jnz L(exit_tail0)
> -
> - test $0x02, %dh
> - jnz L(exit_tail1)
> -
> - test $0x04, %dh
> - jnz L(exit_tail2)
> -
> - test $0x08, %dh
> - jnz L(exit_tail3)
> -
> - test $0x10, %dh
> - jnz L(exit_tail4)
> -
> - test $0x20, %dh
> - jnz L(exit_tail5)
> -
> - test $0x40, %dh
> - jnz L(exit_tail6)
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail1):
> - add $1, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail2):
> - add $2, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail3):
> - add $3, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail4):
> - add $4, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail5):
> - add $5, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail6):
> - add $6, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail7):
> - add $7, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail8):
> - add $8, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail9):
> - add $9, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail10):
> - add $10, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail11):
> - add $11, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail12):
> - add $12, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail13):
> - add $13, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail14):
> - add $14, %eax
> - jmp L(StartStrcpyPart)
> -
> - .p2align 4
> -L(exit_tail15):
> - add $15, %eax
> -
> - .p2align 4
> -L(StartStrcpyPart):
> - mov %rsi, %rcx
> - lea (%rdi, %rax), %rdx
> -# ifdef USE_AS_STRNCAT
> - test %r8, %r8
> - jz L(StrncatExit0)
> - cmp $8, %r8
> - jbe L(StrncatExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - jb L(StrncatExit15Bytes)
> -# endif
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# ifdef USE_AS_STRNCAT
> - cmp $16, %r8
> - je L(StrncatExit16)
> -# define USE_AS_STRNCPY
> -# endif
> -
> -# include "strcpy-ssse3.S"
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit1):
> - xor %ah, %ah
> - movb %ah, 1(%rdx)
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit2):
> - xor %ah, %ah
> - movb %ah, 2(%rdx)
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit3):
> - xor %ah, %ah
> - movb %ah, 3(%rdx)
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit4):
> - xor %ah, %ah
> - movb %ah, 4(%rdx)
> -L(Exit4):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit5):
> - xor %ah, %ah
> - movb %ah, 5(%rdx)
> -L(Exit5):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit6):
> - xor %ah, %ah
> - movb %ah, 6(%rdx)
> -L(Exit6):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit7):
> - xor %ah, %ah
> - movb %ah, 7(%rdx)
> -L(Exit7):
> - mov (%rcx), %eax
> - mov %eax, (%rdx)
> - mov 3(%rcx), %eax
> - mov %eax, 3(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8):
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> -L(Exit8):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit9):
> - xor %ah, %ah
> - movb %ah, 9(%rdx)
> -L(Exit9):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movb 8(%rcx), %al
> - movb %al, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit10):
> - xor %ah, %ah
> - movb %ah, 10(%rdx)
> -L(Exit10):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movw 8(%rcx), %ax
> - movw %ax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit11):
> - xor %ah, %ah
> - movb %ah, 11(%rdx)
> -L(Exit11):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit12):
> - xor %ah, %ah
> - movb %ah, 12(%rdx)
> -L(Exit12):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit13):
> - xor %ah, %ah
> - movb %ah, 13(%rdx)
> -L(Exit13):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 5(%rcx), %xmm1
> - movlpd %xmm1, 5(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit14):
> - xor %ah, %ah
> - movb %ah, 14(%rdx)
> -L(Exit14):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 6(%rcx), %xmm1
> - movlpd %xmm1, 6(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15):
> - xor %ah, %ah
> - movb %ah, 15(%rdx)
> -L(Exit15):
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit16):
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> -L(Exit16):
> - movlpd (%rcx), %xmm0
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm0, (%rdx)
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - test $0x40, %al
> - jnz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHighCase2):
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $9, %r8
> - je L(StrncatExit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $8, %r8
> - ja L(ExitHighCase3)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - xor %ah, %ah
> - movb %ah, 8(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(ExitHighCase3):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - cmp $15, %r8
> - je L(StrncatExit15)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 8(%rcx), %xmm1
> - movlpd %xmm1, 8(%rdx)
> - xor %ah, %ah
> - movb %ah, 16(%rdx)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit0):
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit15Bytes):
> - cmp $9, %r8
> - je L(StrncatExit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $10, %r8
> - je L(StrncatExit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $11, %r8
> - je L(StrncatExit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $12, %r8
> - je L(StrncatExit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $13, %r8
> - je L(StrncatExit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmp $14, %r8
> - je L(StrncatExit14)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - movlpd 7(%rcx), %xmm1
> - movlpd %xmm1, 7(%rdx)
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> - .p2align 4
> -L(StrncatExit8Bytes):
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $1, %r8
> - je L(StrncatExit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $2, %r8
> - je L(StrncatExit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $3, %r8
> - je L(StrncatExit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $4, %r8
> - je L(StrncatExit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $5, %r8
> - je L(StrncatExit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $6, %r8
> - je L(StrncatExit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmp $7, %r8
> - je L(StrncatExit7)
> - movlpd (%rcx), %xmm0
> - movlpd %xmm0, (%rdx)
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> - xor %cl, %cl
> - movb %cl, (%rax)
> - mov %rdi, %rax
> - ret
> -
> -# endif
> -END (STRCAT)
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncat-ssse3.S b/sysdeps/x86_64/multiarch/strncat-ssse3.S
> deleted file mode 100644
> index 6c45ff3ec7..0000000000
> --- a/sysdeps/x86_64/multiarch/strncat-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCAT
> -#define STRCAT __strncat_ssse3
> -#include "strcat-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
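The inlined length scan in the deleted strcat-ssse3.S is the standard pxor/pcmpeqb/pmovmskb idiom: compare an aligned 16-byte block against an all-zero register, extract the per-byte compare results as a 16-bit mask, and a nonzero mask (test %edx, %edx) means the block holds a NUL, whose exact offset the later bit tests recover; the aligned_64 loop simply ORs four such masks to cover 64 bytes per iteration. A rough equivalent of one block in C intrinsics (illustrative only; null_mask16 is our name, not a glibc symbol):

    #include <emmintrin.h>

    /* Illustrative only, not glibc code.  Returns a 16-bit mask with
       bit i set iff p[i] == 0, for one 16-byte-aligned block -- the
       pxor/pcmpeqb/pmovmskb sequence from the deleted file.  */
    static int
    null_mask16 (const char *p)
    {
      __m128i v = _mm_load_si128 ((const __m128i *) p); /* aligned load      */
      __m128i z = _mm_setzero_si128 ();                 /* pxor %xmm0, %xmm0 */
      return _mm_movemask_epi8 (_mm_cmpeq_epi8 (v, z)); /* pcmpeqb+pmovmskb  */
    }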
* Re: [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
@ 2022-04-14 18:10 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-04-14 18:10 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result it is no longer worth it to keep the SSSE3
> versions given the code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 4 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 8 -
> sysdeps/x86_64/multiarch/stpcpy-ssse3.S | 3 -
> sysdeps/x86_64/multiarch/stpncpy-ssse3.S | 4 -
> sysdeps/x86_64/multiarch/strcpy-ssse3.S | 3550 --------------------
> sysdeps/x86_64/multiarch/strncpy-ssse3.S | 3 -
> 6 files changed, 3572 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/strncpy-ssse3.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 2b3c625ea2..5b02ec8de5 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -46,13 +46,11 @@ sysdep_routines += \
> stpcpy-evex \
> stpcpy-sse2 \
> stpcpy-sse2-unaligned \
> - stpcpy-ssse3 \
> stpncpy-avx2 \
> stpncpy-avx2-rtm \
> stpncpy-c \
> stpncpy-evex \
> stpncpy-sse2-unaligned \
> - stpncpy-ssse3 \
> strcasecmp_l-avx2 \
> strcasecmp_l-avx2-rtm \
> strcasecmp_l-evex \
> @@ -83,7 +81,6 @@ sysdep_routines += \
> strcpy-evex \
> strcpy-sse2 \
> strcpy-sse2-unaligned \
> - strcpy-ssse3 \
> strcspn-c \
> strcspn-sse2 \
> strlen-avx2 \
> @@ -110,7 +107,6 @@ sysdep_routines += \
> strncpy-c \
> strncpy-evex \
> strncpy-sse2-unaligned \
> - strncpy-ssse3 \
> strnlen-avx2 \
> strnlen-avx2-rtm \
> strnlen-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 41a04621ad..49ce6860d0 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -399,8 +399,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpncpy.c. */
> IFUNC_IMPL (i, name, stpncpy,
> - IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2),
> __stpncpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpncpy,
> @@ -417,8 +415,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
>
> /* Support sysdeps/x86_64/multiarch/stpcpy.c. */
> IFUNC_IMPL (i, name, stpcpy,
> - IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (SSSE3),
> - __stpcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2),
> __stpcpy_avx2)
> IFUNC_IMPL_ADD (array, i, stpcpy,
> @@ -567,8 +563,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strcpy_evex)
> - IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3),
> - __strcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2))
>
> @@ -644,8 +638,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> (CPU_FEATURE_USABLE (AVX512VL)
> && CPU_FEATURE_USABLE (AVX512BW)),
> __strncpy_evex)
> - IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3),
> - __strncpy_ssse3)
> IFUNC_IMPL_ADD (array, i, strncpy, 1,
> __strncpy_sse2_unaligned)
> IFUNC_IMPL_ADD (array, i, strncpy, 1, __strncpy_sse2))
> diff --git a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S b/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> deleted file mode 100644
> index d971c2da38..0000000000
> --- a/sysdeps/x86_64/multiarch/stpcpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STPCPY
> -#define STRCPY __stpcpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S b/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> deleted file mode 100644
> index 14ed16f6b5..0000000000
> --- a/sysdeps/x86_64/multiarch/stpncpy-ssse3.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_STPCPY
> -#define USE_AS_STRNCPY
> -#define STRCPY __stpncpy_ssse3
> -#include "strcpy-ssse3.S"
> diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> deleted file mode 100644
> index f617a535cf..0000000000
> --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
> +++ /dev/null
> @@ -1,3550 +0,0 @@
> -/* strcpy with SSSE3
> - Copyright (C) 2011-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#if IS_IN (libc)
> -
> -# ifndef USE_AS_STRCAT
> -# include <sysdep.h>
> -
> -# ifndef STRCPY
> -# define STRCPY __strcpy_ssse3
> -# endif
> -
> - .section .text.ssse3,"ax",@progbits
> -ENTRY (STRCPY)
> -
> - mov %rsi, %rcx
> -# ifdef USE_AS_STRNCPY
> - mov %RDX_LP, %R8_LP
> -# endif
> - mov %rdi, %rdx
> -# ifdef USE_AS_STRNCPY
> - test %R8_LP, %R8_LP
> - jz L(Exit0)
> - cmp $8, %R8_LP
> - jbe L(StrncpyExit8Bytes)
> -# endif
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - cmpb $0, 7(%rcx)
> - jz L(Exit8)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - jb L(StrncpyExit15Bytes)
> -# endif
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - cmpb $0, 14(%rcx)
> - jz L(Exit15)
> -# ifdef USE_AS_STRNCPY
> - cmp $16, %r8
> - je L(Exit16)
> -# endif
> - cmpb $0, 15(%rcx)
> - jz L(Exit16)
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - mov %rcx, %rsi
> - sub $16, %r8
> - and $0xf, %rsi
> -
> -/* add 16 bytes rcx_offset to r8 */
> -
> - add %rsi, %r8
> -# endif
> - lea 16(%rcx), %rsi
> - and $-16, %rsi
> - pxor %xmm0, %xmm0
> - mov (%rcx), %r9
> - mov %r9, (%rdx)
> - pcmpeqb (%rsi), %xmm0
> - mov 8(%rcx), %r9
> - mov %r9, 8(%rdx)
> -
> -/* convert byte mask in xmm0 to bit mask */
> -
> - pmovmskb %xmm0, %rax
> - sub %rcx, %rsi
> -
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - mov %rdx, %rax
> - lea 16(%rdx), %rdx
> - and $-16, %rdx
> - sub %rdx, %rax
> -
> -# ifdef USE_AS_STRNCPY
> - add %rax, %rsi
> - lea -1(%rsi), %rsi
> - and $1<<31, %esi
> - test %rsi, %rsi
> - jnz L(ContinueCopy)
> - lea 16(%r8), %r8
> -
> -L(ContinueCopy):
> -# endif
> - sub %rax, %rcx
> - mov %rcx, %rax
> - and $0xf, %rax
> - mov $0, %rsi
> -
> -/* case: rcx_offset == rdx_offset */
> -
> - jz L(Align16Both)
> -
> - cmp $8, %rax
> - jae L(ShlHigh8)
> - cmp $1, %rax
> - je L(Shl1)
> - cmp $2, %rax
> - je L(Shl2)
> - cmp $3, %rax
> - je L(Shl3)
> - cmp $4, %rax
> - je L(Shl4)
> - cmp $5, %rax
> - je L(Shl5)
> - cmp $6, %rax
> - je L(Shl6)
> - jmp L(Shl7)
> -
> -L(ShlHigh8):
> - je L(Shl8)
> - cmp $9, %rax
> - je L(Shl9)
> - cmp $10, %rax
> - je L(Shl10)
> - cmp $11, %rax
> - je L(Shl11)
> - cmp $12, %rax
> - je L(Shl12)
> - cmp $13, %rax
> - je L(Shl13)
> - cmp $14, %rax
> - je L(Shl14)
> - jmp L(Shl15)
> -
> -L(Align16Both):
> - movaps (%rcx), %xmm1
> - movaps 16(%rcx), %xmm2
> - movaps %xmm1, (%rdx)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm4
> - movaps %xmm3, (%rdx, %rsi)
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm1
> - movaps %xmm4, (%rdx, %rsi)
> - pcmpeqb %xmm1, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm2
> - movaps %xmm1, (%rdx, %rsi)
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps 16(%rcx, %rsi), %xmm3
> - movaps %xmm2, (%rdx, %rsi)
> - pcmpeqb %xmm3, %xmm0
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm3, (%rdx, %rsi)
> - mov %rcx, %rax
> - lea 16(%rcx, %rsi), %rcx
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - lea 112(%r8, %rax), %r8
> -# endif
> - mov $-0x40, %rsi
> -
> - .p2align 4
> -L(Aligned64Loop):
> - movaps (%rcx), %xmm2
> - movaps %xmm2, %xmm4
> - movaps 16(%rcx), %xmm5
> - movaps 32(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 48(%rcx), %xmm7
> - pminub %xmm5, %xmm2
> - pminub %xmm7, %xmm3
> - pminub %xmm2, %xmm3
> - pcmpeqb %xmm0, %xmm3
> - pmovmskb %xmm3, %rax
> - lea 64(%rdx), %rdx
> - lea 64(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeaveCase2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Aligned64Leave)
> - movaps %xmm4, -64(%rdx)
> - movaps %xmm5, -48(%rdx)
> - movaps %xmm6, -32(%rdx)
> - movaps %xmm7, -16(%rdx)
> - jmp L(Aligned64Loop)
> -
> -L(Aligned64Leave):
> -# ifdef USE_AS_STRNCPY
> - lea 48(%r8), %r8
> -# endif
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - test %rax, %rax
> - lea 16(%rsi), %rsi
> - jnz L(CopyFrom1To16Bytes)
> -
> - movaps %xmm6, -32(%rdx)
> - pcmpeqb %xmm7, %xmm0
> -# ifdef USE_AS_STRNCPY
> - lea -16(%r8), %r8
> -# endif
> - pmovmskb %xmm0, %rax
> - lea 16(%rsi), %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl1):
> - movaps -1(%rcx), %xmm1
> - movaps 15(%rcx), %xmm2
> -L(Shl1Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit1Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl1LoopExit)
> -
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 31(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -15(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -1(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl1LoopStart):
> - movaps 15(%rcx), %xmm2
> - movaps 31(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 47(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 63(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $1, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $1, %xmm3, %xmm4
> - jnz L(Shl1Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave1)
> -# endif
> - palignr $1, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl1LoopStart)
> -
> -L(Shl1LoopExit):
> - movdqu -1(%rcx), %xmm1
> - mov $15, %rsi
> - movdqu %xmm1, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl2):
> - movaps -2(%rcx), %xmm1
> - movaps 14(%rcx), %xmm2
> -L(Shl2Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit2Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl2LoopExit)
> -
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 30(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -14(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -2(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl2LoopStart):
> - movaps 14(%rcx), %xmm2
> - movaps 30(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 46(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 62(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $2, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $2, %xmm3, %xmm4
> - jnz L(Shl2Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave2)
> -# endif
> - palignr $2, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl2LoopStart)
> -
> -L(Shl2LoopExit):
> - movdqu -2(%rcx), %xmm1
> - mov $14, %rsi
> - movdqu %xmm1, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl3):
> - movaps -3(%rcx), %xmm1
> - movaps 13(%rcx), %xmm2
> -L(Shl3Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit3Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl3LoopExit)
> -
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 29(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -13(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -3(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl3LoopStart):
> - movaps 13(%rcx), %xmm2
> - movaps 29(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 45(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 61(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $3, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $3, %xmm3, %xmm4
> - jnz L(Shl3Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave3)
> -# endif
> - palignr $3, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl3LoopStart)
> -
> -L(Shl3LoopExit):
> - movdqu -3(%rcx), %xmm1
> - mov $13, %rsi
> - movdqu %xmm1, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl4):
> - movaps -4(%rcx), %xmm1
> - movaps 12(%rcx), %xmm2
> -L(Shl4Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit4Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl4LoopExit)
> -
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 28(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -12(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -4(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl4LoopStart):
> - movaps 12(%rcx), %xmm2
> - movaps 28(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 44(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 60(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $4, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $4, %xmm3, %xmm4
> - jnz L(Shl4Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave4)
> -# endif
> - palignr $4, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl4LoopStart)
> -
> -L(Shl4LoopExit):
> - movdqu -4(%rcx), %xmm1
> - mov $12, %rsi
> - movdqu %xmm1, -4(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl5):
> - movaps -5(%rcx), %xmm1
> - movaps 11(%rcx), %xmm2
> -L(Shl5Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit5Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl5LoopExit)
> -
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 27(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -11(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -5(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl5LoopStart):
> - movaps 11(%rcx), %xmm2
> - movaps 27(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 43(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 59(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $5, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $5, %xmm3, %xmm4
> - jnz L(Shl5Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave5)
> -# endif
> - palignr $5, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl5LoopStart)
> -
> -L(Shl5LoopExit):
> - movdqu -5(%rcx), %xmm1
> - mov $11, %rsi
> - movdqu %xmm1, -5(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl6):
> - movaps -6(%rcx), %xmm1
> - movaps 10(%rcx), %xmm2
> -L(Shl6Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit6Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl6LoopExit)
> -
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 26(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -10(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -6(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl6LoopStart):
> - movaps 10(%rcx), %xmm2
> - movaps 26(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 42(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 58(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $6, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $6, %xmm3, %xmm4
> - jnz L(Shl6Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave6)
> -# endif
> - palignr $6, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl6LoopStart)
> -
> -L(Shl6LoopExit):
> - mov (%rcx), %r9
> - mov 6(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 6(%rdx)
> - mov $10, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl7):
> - movaps -7(%rcx), %xmm1
> - movaps 9(%rcx), %xmm2
> -L(Shl7Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit7Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl7LoopExit)
> -
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 25(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -9(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -7(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl7LoopStart):
> - movaps 9(%rcx), %xmm2
> - movaps 25(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 41(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 57(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $7, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $7, %xmm3, %xmm4
> - jnz L(Shl7Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave7)
> -# endif
> - palignr $7, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl7LoopStart)
> -
> -L(Shl7LoopExit):
> - mov (%rcx), %r9
> - mov 5(%rcx), %esi
> - mov %r9, (%rdx)
> - mov %esi, 5(%rdx)
> - mov $9, %rsi
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl8):
> - movaps -8(%rcx), %xmm1
> - movaps 8(%rcx), %xmm2
> -L(Shl8Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit8Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl8LoopExit)
> -
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 24(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -8(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -8(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl8LoopStart):
> - movaps 8(%rcx), %xmm2
> - movaps 24(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 40(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 56(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $8, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $8, %xmm3, %xmm4
> - jnz L(Shl8Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave8)
> -# endif
> - palignr $8, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl8LoopStart)
> -
> -L(Shl8LoopExit):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl9):
> - movaps -9(%rcx), %xmm1
> - movaps 7(%rcx), %xmm2
> -L(Shl9Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit9Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl9LoopExit)
> -
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 23(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -7(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -9(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl9LoopStart):
> - movaps 7(%rcx), %xmm2
> - movaps 23(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 39(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 55(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $9, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $9, %xmm3, %xmm4
> - jnz L(Shl9Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave9)
> -# endif
> - palignr $9, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl9LoopStart)
> -
> -L(Shl9LoopExit):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl10):
> - movaps -10(%rcx), %xmm1
> - movaps 6(%rcx), %xmm2
> -L(Shl10Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit10Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl10LoopExit)
> -
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 22(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -6(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -10(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl10LoopStart):
> - movaps 6(%rcx), %xmm2
> - movaps 22(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 38(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 54(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $10, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $10, %xmm3, %xmm4
> - jnz L(Shl10Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave10)
> -# endif
> - palignr $10, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl10LoopStart)
> -
> -L(Shl10LoopExit):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl11):
> - movaps -11(%rcx), %xmm1
> - movaps 5(%rcx), %xmm2
> -L(Shl11Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit11Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl11LoopExit)
> -
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 21(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -5(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -11(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl11LoopStart):
> - movaps 5(%rcx), %xmm2
> - movaps 21(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 37(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 53(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $11, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $11, %xmm3, %xmm4
> - jnz L(Shl11Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave11)
> -# endif
> - palignr $11, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl11LoopStart)
> -
> -L(Shl11LoopExit):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl12):
> - movaps -12(%rcx), %xmm1
> - movaps 4(%rcx), %xmm2
> -L(Shl12Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit12Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl12LoopExit)
> -
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 20(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -4(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -12(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl12LoopStart):
> - movaps 4(%rcx), %xmm2
> - movaps 20(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 36(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 52(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $12, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $12, %xmm3, %xmm4
> - jnz L(Shl12Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave12)
> -# endif
> - palignr $12, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl12LoopStart)
> -
> -L(Shl12LoopExit):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl13):
> - movaps -13(%rcx), %xmm1
> - movaps 3(%rcx), %xmm2
> -L(Shl13Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit13Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl13LoopExit)
> -
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 19(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -3(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -13(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl13LoopStart):
> - movaps 3(%rcx), %xmm2
> - movaps 19(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 35(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 51(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $13, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $13, %xmm3, %xmm4
> - jnz L(Shl13Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave13)
> -# endif
> - palignr $13, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl13LoopStart)
> -
> -L(Shl13LoopExit):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl14):
> - movaps -14(%rcx), %xmm1
> - movaps 2(%rcx), %xmm2
> -L(Shl14Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit14Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl14LoopExit)
> -
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 18(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -2(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -14(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl14LoopStart):
> - movaps 2(%rcx), %xmm2
> - movaps 18(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 34(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 50(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $14, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $14, %xmm3, %xmm4
> - jnz L(Shl14Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave14)
> -# endif
> - palignr $14, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl14LoopStart)
> -
> -L(Shl14LoopExit):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - jmp L(CopyFrom1To16Bytes)
> -
> - .p2align 4
> -L(Shl15):
> - movaps -15(%rcx), %xmm1
> - movaps 1(%rcx), %xmm2
> -L(Shl15Start):
> - pcmpeqb %xmm2, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm1
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> - movaps %xmm2, %xmm3
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> -
> - pcmpeqb %xmm2, %xmm0
> - lea 16(%rdx), %rdx
> - pmovmskb %xmm0, %rax
> - lea 16(%rcx), %rcx
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - jbe L(StrncpyExit15Case2OrCase3)
> -# endif
> - test %rax, %rax
> - jnz L(Shl15LoopExit)
> -
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, (%rdx)
> - lea 17(%rcx), %rcx
> - lea 16(%rdx), %rdx
> -
> - mov %rcx, %rax
> - and $-0x40, %rcx
> - sub %rcx, %rax
> - lea -1(%rcx), %rcx
> - sub %rax, %rdx
> -# ifdef USE_AS_STRNCPY
> - add %rax, %r8
> -# endif
> - movaps -15(%rcx), %xmm1
> -
> -/* 64 bytes loop */
> - .p2align 4
> -L(Shl15LoopStart):
> - movaps 1(%rcx), %xmm2
> - movaps 17(%rcx), %xmm3
> - movaps %xmm3, %xmm6
> - movaps 33(%rcx), %xmm4
> - movaps %xmm4, %xmm7
> - movaps 49(%rcx), %xmm5
> - pminub %xmm2, %xmm6
> - pminub %xmm5, %xmm7
> - pminub %xmm6, %xmm7
> - pcmpeqb %xmm0, %xmm7
> - pmovmskb %xmm7, %rax
> - movaps %xmm5, %xmm7
> - palignr $15, %xmm4, %xmm5
> - test %rax, %rax
> - palignr $15, %xmm3, %xmm4
> - jnz L(Shl15Start)
> -# ifdef USE_AS_STRNCPY
> - sub $64, %r8
> - jbe L(StrncpyLeave15)
> -# endif
> - palignr $15, %xmm2, %xmm3
> - lea 64(%rcx), %rcx
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm7, %xmm1
> - movaps %xmm5, 48(%rdx)
> - movaps %xmm4, 32(%rdx)
> - movaps %xmm3, 16(%rdx)
> - movaps %xmm2, (%rdx)
> - lea 64(%rdx), %rdx
> - jmp L(Shl15LoopStart)
> -
> -L(Shl15LoopExit):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> -# ifdef USE_AS_STRCAT
> - jmp L(CopyFrom1To16Bytes)
> -# endif
> -
> -# ifndef USE_AS_STRCAT
> -
> - .p2align 4
> -L(CopyFrom1To16Bytes):
> -# ifdef USE_AS_STRNCPY
> - add $16, %r8
> -# endif
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - test %al, %al
> - jz L(ExitHigh)
> - test $0x01, %al
> - jnz L(Exit1)
> - test $0x02, %al
> - jnz L(Exit2)
> - test $0x04, %al
> - jnz L(Exit3)
> - test $0x08, %al
> - jnz L(Exit4)
> - test $0x10, %al
> - jnz L(Exit5)
> - test $0x20, %al
> - jnz L(Exit6)
> - test $0x40, %al
> - jnz L(Exit7)
> -
> - .p2align 4
> -L(Exit8):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $8, %r8
> - lea 8(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(ExitHigh):
> - test $0x01, %ah
> - jnz L(Exit9)
> - test $0x02, %ah
> - jnz L(Exit10)
> - test $0x04, %ah
> - jnz L(Exit11)
> - test $0x08, %ah
> - jnz L(Exit12)
> - test $0x10, %ah
> - jnz L(Exit13)
> - test $0x20, %ah
> - jnz L(Exit14)
> - test $0x40, %ah
> - jnz L(Exit15)
> -
> - .p2align 4
> -L(Exit16):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %rax
> - mov %rax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 15(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $16, %r8
> - lea 16(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase2):
> - add $16, %r8
> - add %rsi, %rcx
> - lea (%rsi, %rdx), %rsi
> - lea -9(%r8), %rdx
> - and $1<<7, %dh
> - or %al, %dh
> - test %dh, %dh
> - lea (%rsi), %rdx
> - jz L(ExitHighCase2)
> -
> - cmp $1, %r8
> - je L(Exit1)
> - test $0x01, %al
> - jnz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - test $0x02, %al
> - jnz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - test $0x04, %al
> - jnz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - test $0x08, %al
> - jnz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - test $0x10, %al
> - jnz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - test $0x20, %al
> - jnz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - test $0x40, %al
> - jnz L(Exit7)
> - jmp L(Exit8)
> -
> - .p2align 4
> -L(ExitHighCase2):
> - cmp $9, %r8
> - je L(Exit9)
> - test $0x01, %ah
> - jnz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - test $0x02, %ah
> - jnz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - test $0x04, %ah
> - jnz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - test $0x8, %ah
> - jnz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - test $0x10, %ah
> - jnz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - test $0x20, %ah
> - jnz L(Exit14)
> - cmp $15, %r8
> - je L(Exit15)
> - test $0x40, %ah
> - jnz L(Exit15)
> - jmp L(Exit16)
> -
> -L(CopyFrom1To16BytesCase2OrCase3):
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> -
> - .p2align 4
> -L(CopyFrom1To16BytesCase3):
> - add $16, %r8
> - add %rsi, %rdx
> - add %rsi, %rcx
> -
> - cmp $16, %r8
> - je L(Exit16)
> - cmp $8, %r8
> - je L(Exit8)
> - jg L(More8Case3)
> - cmp $4, %r8
> - je L(Exit4)
> - jg L(More4Case3)
> - cmp $2, %r8
> - jl L(Exit1)
> - je L(Exit2)
> - jg L(Exit3)
> -L(More8Case3): /* but less than 16 */
> - cmp $12, %r8
> - je L(Exit12)
> - jl L(Less12Case3)
> - cmp $14, %r8
> - jl L(Exit13)
> - je L(Exit14)
> - jg L(Exit15)
> -L(More4Case3): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Exit5)
> - je L(Exit6)
> - jg L(Exit7)
> -L(Less12Case3): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Exit9)
> - je L(Exit10)
> - jg L(Exit11)
> -# endif
> -
> - .p2align 4
> -L(Exit1):
> - movb (%rcx), %al
> - movb %al, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea (%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $1, %r8
> - lea 1(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit2):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 1(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $2, %r8
> - lea 2(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit3):
> - movw (%rcx), %ax
> - movw %ax, (%rdx)
> - movb 2(%rcx), %al
> - movb %al, 2(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 2(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $3, %r8
> - lea 3(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit4):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 3(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $4, %r8
> - lea 4(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit5):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movb 4(%rcx), %al
> - movb %al, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 4(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $5, %r8
> - lea 5(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit6):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movw 4(%rcx), %ax
> - movw %ax, 4(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 5(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $6, %r8
> - lea 6(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit7):
> - movl (%rcx), %eax
> - movl %eax, (%rdx)
> - movl 3(%rcx), %eax
> - movl %eax, 3(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 6(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $7, %r8
> - lea 7(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit9):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %eax
> - mov %eax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 8(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $9, %r8
> - lea 9(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit10):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %eax
> - mov %eax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 9(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $10, %r8
> - lea 10(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit11):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %eax
> - mov %eax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 10(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $11, %r8
> - lea 11(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit12):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 8(%rcx), %eax
> - mov %eax, 8(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 11(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $12, %r8
> - lea 12(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit13):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 5(%rcx), %rax
> - mov %rax, 5(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 12(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $13, %r8
> - lea 13(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit14):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 6(%rcx), %rax
> - mov %rax, 6(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 13(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $14, %r8
> - lea 14(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> - .p2align 4
> -L(Exit15):
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> -# else
> - mov %rdi, %rax
> -# endif
> -# ifdef USE_AS_STRNCPY
> - sub $15, %r8
> - lea 15(%rdx), %rcx
> - jnz L(StrncpyFillTailWithZero1)
> -# ifdef USE_AS_STPCPY
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# endif
> -# endif
> - ret
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(Fill0):
> - ret
> -
> - .p2align 4
> -L(Fill1):
> - movb %dl, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill2):
> - movw %dx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill3):
> - movw %dx, (%rcx)
> - movb %dl, 2(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill4):
> - movl %edx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill5):
> - movl %edx, (%rcx)
> - movb %dl, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill6):
> - movl %edx, (%rcx)
> - movw %dx, 4(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill7):
> - movl %edx, (%rcx)
> - movl %edx, 3(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill8):
> - mov %rdx, (%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill9):
> - mov %rdx, (%rcx)
> - movb %dl, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill10):
> - mov %rdx, (%rcx)
> - movw %dx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill11):
> - mov %rdx, (%rcx)
> - movl %edx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill12):
> - mov %rdx, (%rcx)
> - movl %edx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill13):
> - mov %rdx, (%rcx)
> - mov %rdx, 5(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill14):
> - mov %rdx, (%rcx)
> - mov %rdx, 6(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill15):
> - mov %rdx, (%rcx)
> - mov %rdx, 7(%rcx)
> - ret
> -
> - .p2align 4
> -L(Fill16):
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> - ret
> -
> - .p2align 4
> -L(StrncpyFillExit1):
> - lea 16(%r8), %r8
> -L(FillFrom1To16Bytes):
> - test %r8, %r8
> - jz L(Fill0)
> - cmp $16, %r8
> - je L(Fill16)
> - cmp $8, %r8
> - je L(Fill8)
> - jg L(FillMore8)
> - cmp $4, %r8
> - je L(Fill4)
> - jg L(FillMore4)
> - cmp $2, %r8
> - jl L(Fill1)
> - je L(Fill2)
> - jg L(Fill3)
> -L(FillMore8): /* but less than 16 */
> - cmp $12, %r8
> - je L(Fill12)
> - jl L(FillLess12)
> - cmp $14, %r8
> - jl L(Fill13)
> - je L(Fill14)
> - jg L(Fill15)
> -L(FillMore4): /* but less than 8 */
> - cmp $6, %r8
> - jl L(Fill5)
> - je L(Fill6)
> - jg L(Fill7)
> -L(FillLess12): /* but more than 8 */
> - cmp $10, %r8
> - jl L(Fill9)
> - je L(Fill10)
> - jmp L(Fill11)
> -
> - .p2align 4
> -L(StrncpyFillTailWithZero1):
> - xor %rdx, %rdx
> - sub $16, %r8
> - jbe L(StrncpyFillExit1)
> -
> - pxor %xmm0, %xmm0
> - mov %rdx, (%rcx)
> - mov %rdx, 8(%rcx)
> -
> - lea 16(%rcx), %rcx
> -
> - mov %rcx, %rdx
> - and $0xf, %rdx
> - sub %rdx, %rcx
> - add %rdx, %r8
> - xor %rdx, %rdx
> - sub $64, %r8
> - jb L(StrncpyFillLess64)
> -
> -L(StrncpyFillLoopMovdqa):
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - movdqa %xmm0, 32(%rcx)
> - movdqa %xmm0, 48(%rcx)
> - lea 64(%rcx), %rcx
> - sub $64, %r8
> - jae L(StrncpyFillLoopMovdqa)
> -
> -L(StrncpyFillLess64):
> - add $32, %r8
> - jl L(StrncpyFillLess32)
> - movdqa %xmm0, (%rcx)
> - movdqa %xmm0, 16(%rcx)
> - lea 32(%rcx), %rcx
> - sub $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> -L(StrncpyFillLess32):
> - add $16, %r8
> - jl L(StrncpyFillExit1)
> - movdqa %xmm0, (%rcx)
> - lea 16(%rcx), %rcx
> - jmp L(FillFrom1To16Bytes)
> -
> - .p2align 4
> -L(Exit0):
> - mov %rdx, %rax
> - ret
> -
> - .p2align 4
> -L(StrncpyExit15Bytes):
> - cmp $9, %r8
> - je L(Exit9)
> - cmpb $0, 8(%rcx)
> - jz L(Exit9)
> - cmp $10, %r8
> - je L(Exit10)
> - cmpb $0, 9(%rcx)
> - jz L(Exit10)
> - cmp $11, %r8
> - je L(Exit11)
> - cmpb $0, 10(%rcx)
> - jz L(Exit11)
> - cmp $12, %r8
> - je L(Exit12)
> - cmpb $0, 11(%rcx)
> - jz L(Exit12)
> - cmp $13, %r8
> - je L(Exit13)
> - cmpb $0, 12(%rcx)
> - jz L(Exit13)
> - cmp $14, %r8
> - je L(Exit14)
> - cmpb $0, 13(%rcx)
> - jz L(Exit14)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> - mov 7(%rcx), %rax
> - mov %rax, 7(%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 14(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> - .p2align 4
> -L(StrncpyExit8Bytes):
> - cmp $1, %r8
> - je L(Exit1)
> - cmpb $0, (%rcx)
> - jz L(Exit1)
> - cmp $2, %r8
> - je L(Exit2)
> - cmpb $0, 1(%rcx)
> - jz L(Exit2)
> - cmp $3, %r8
> - je L(Exit3)
> - cmpb $0, 2(%rcx)
> - jz L(Exit3)
> - cmp $4, %r8
> - je L(Exit4)
> - cmpb $0, 3(%rcx)
> - jz L(Exit4)
> - cmp $5, %r8
> - je L(Exit5)
> - cmpb $0, 4(%rcx)
> - jz L(Exit5)
> - cmp $6, %r8
> - je L(Exit6)
> - cmpb $0, 5(%rcx)
> - jz L(Exit6)
> - cmp $7, %r8
> - je L(Exit7)
> - cmpb $0, 6(%rcx)
> - jz L(Exit7)
> - mov (%rcx), %rax
> - mov %rax, (%rdx)
> -# ifdef USE_AS_STPCPY
> - lea 7(%rdx), %rax
> - cmpb $1, (%rax)
> - sbb $-1, %rax
> -# else
> - mov %rdi, %rax
> -# endif
> - ret
> -
> -# endif
> -# endif
> -
> -# ifdef USE_AS_STRNCPY
> - .p2align 4
> -L(StrncpyLeaveCase2OrCase3):
> - test %rax, %rax
> - jnz L(Aligned64LeaveCase2)
> -
> -L(Aligned64LeaveCase3):
> - lea 64(%r8), %r8
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase3)
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -L(Aligned64LeaveCase2):
> - pcmpeqb %xmm4, %xmm0
> - pmovmskb %xmm0, %rax
> - add $48, %r8
> - jle L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm5, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm4, -64(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm6, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm5, -48(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(CopyFrom1To16BytesCase2OrCase3)
> - test %rax, %rax
> - jnz L(CopyFrom1To16Bytes)
> -
> - pcmpeqb %xmm7, %xmm0
> - pmovmskb %xmm0, %rax
> - movaps %xmm6, -32(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> - jmp L(CopyFrom1To16BytesCase2)
> -/*--------------------------------------------------*/
> - .p2align 4
> -L(StrncpyExit1Case2OrCase3):
> - movdqu -1(%rcx), %xmm0
> - movdqu %xmm0, -1(%rdx)
> - mov $15, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit2Case2OrCase3):
> - movdqu -2(%rcx), %xmm0
> - movdqu %xmm0, -2(%rdx)
> - mov $14, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit3Case2OrCase3):
> - movdqu -3(%rcx), %xmm0
> - movdqu %xmm0, -3(%rdx)
> - mov $13, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit4Case2OrCase3):
> - movdqu -4(%rcx), %xmm0
> - movdqu %xmm0, -4(%rdx)
> - mov $12, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit5Case2OrCase3):
> - movdqu -5(%rcx), %xmm0
> - movdqu %xmm0, -5(%rdx)
> - mov $11, %rsi
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit6Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 6(%rcx), %r9d
> - mov %r9d, 6(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $10, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit7Case2OrCase3):
> - mov (%rcx), %rsi
> - mov 5(%rcx), %r9d
> - mov %r9d, 5(%rdx)
> - mov %rsi, (%rdx)
> - test %rax, %rax
> - mov $9, %rsi
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit8Case2OrCase3):
> - mov (%rcx), %r9
> - mov $8, %rsi
> - mov %r9, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit9Case2OrCase3):
> - mov -1(%rcx), %r9
> - mov $7, %rsi
> - mov %r9, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit10Case2OrCase3):
> - mov -2(%rcx), %r9
> - mov $6, %rsi
> - mov %r9, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit11Case2OrCase3):
> - mov -3(%rcx), %r9
> - mov $5, %rsi
> - mov %r9, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit12Case2OrCase3):
> - mov (%rcx), %r9d
> - mov $4, %rsi
> - mov %r9d, (%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit13Case2OrCase3):
> - mov -1(%rcx), %r9d
> - mov $3, %rsi
> - mov %r9d, -1(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit14Case2OrCase3):
> - mov -2(%rcx), %r9d
> - mov $2, %rsi
> - mov %r9d, -2(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyExit15Case2OrCase3):
> - mov -3(%rcx), %r9d
> - mov $1, %rsi
> - mov %r9d, -3(%rdx)
> - test %rax, %rax
> - jnz L(CopyFrom1To16BytesCase2)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave1):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit1)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 31(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit1)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit1):
> - lea 15(%rdx, %rsi), %rdx
> - lea 15(%rcx, %rsi), %rcx
> - mov -15(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -15(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave2):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit2)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 30(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit2)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit2):
> - lea 14(%rdx, %rsi), %rdx
> - lea 14(%rcx, %rsi), %rcx
> - mov -14(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -14(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave3):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit3)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 29(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit3)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit3):
> - lea 13(%rdx, %rsi), %rdx
> - lea 13(%rcx, %rsi), %rcx
> - mov -13(%rcx), %rsi
> - mov -8(%rcx), %rax
> - mov %rsi, -13(%rdx)
> - mov %rax, -8(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave4):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit4)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 28(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit4)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit4):
> - lea 12(%rdx, %rsi), %rdx
> - lea 12(%rcx, %rsi), %rcx
> - mov -12(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -12(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave5):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit5)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 27(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit5)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit5):
> - lea 11(%rdx, %rsi), %rdx
> - lea 11(%rcx, %rsi), %rcx
> - mov -11(%rcx), %rsi
> - mov -4(%rcx), %eax
> - mov %rsi, -11(%rdx)
> - mov %eax, -4(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave6):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit6)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 26(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit6)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit6):
> - lea 10(%rdx, %rsi), %rdx
> - lea 10(%rcx, %rsi), %rcx
> - mov -10(%rcx), %rsi
> - movw -2(%rcx), %ax
> - mov %rsi, -10(%rdx)
> - movw %ax, -2(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave7):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit7)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 25(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit7)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit7):
> - lea 9(%rdx, %rsi), %rdx
> - lea 9(%rcx, %rsi), %rcx
> - mov -9(%rcx), %rsi
> - movb -1(%rcx), %ah
> - mov %rsi, -9(%rdx)
> - movb %ah, -1(%rdx)
> - xor %rsi, %rsi
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave8):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit8)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 24(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit8)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit8):
> - lea 8(%rdx, %rsi), %rdx
> - lea 8(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave9):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit9)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 23(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit9)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit9):
> - lea 7(%rdx, %rsi), %rdx
> - lea 7(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave10):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit10)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 22(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit10)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit10):
> - lea 6(%rdx, %rsi), %rdx
> - lea 6(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave11):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit11)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 21(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit11)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit11):
> - lea 5(%rdx, %rsi), %rdx
> - lea 5(%rcx, %rsi), %rcx
> - mov -8(%rcx), %rax
> - xor %rsi, %rsi
> - mov %rax, -8(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave12):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit12)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 20(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit12)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit12):
> - lea 4(%rdx, %rsi), %rdx
> - lea 4(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave13):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit13)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 19(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit13)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit13):
> - lea 3(%rdx, %rsi), %rdx
> - lea 3(%rcx, %rsi), %rcx
> - mov -4(%rcx), %eax
> - xor %rsi, %rsi
> - mov %eax, -4(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave14):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit14)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 18(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit14)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit14):
> - lea 2(%rdx, %rsi), %rdx
> - lea 2(%rcx, %rsi), %rcx
> - movw -2(%rcx), %ax
> - xor %rsi, %rsi
> - movw %ax, -2(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> - .p2align 4
> -L(StrncpyLeave15):
> - movaps %xmm2, %xmm3
> - add $48, %r8
> - jle L(StrncpyExit15)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdx)
> - movaps 17(%rcx), %xmm2
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, 16(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm4, 32(%rdx)
> - lea 16(%rsi), %rsi
> - sub $16, %r8
> - jbe L(StrncpyExit15)
> - movaps %xmm5, 48(%rdx)
> - lea 16(%rsi), %rsi
> - lea -16(%r8), %r8
> -
> -L(StrncpyExit15):
> - lea 1(%rdx, %rsi), %rdx
> - lea 1(%rcx, %rsi), %rcx
> - movb -1(%rcx), %ah
> - xor %rsi, %rsi
> - movb %ah, -1(%rdx)
> - jmp L(CopyFrom1To16BytesCase3)
> -
> -# endif
> -# ifndef USE_AS_STRCAT
> -END (STRCPY)
> -# endif
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/strncpy-ssse3.S b/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> deleted file mode 100644
> index bf82ee447d..0000000000
> --- a/sysdeps/x86_64/multiarch/strncpy-ssse3.S
> +++ /dev/null
> @@ -1,3 +0,0 @@
> -#define USE_AS_STRNCPY
> -#define STRCPY __strncpy_ssse3
> -#include "strcpy-ssse3.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
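
A note on the kernels deleted above, for readers skimming the diff:
strcpy-ssse3.S is dominated by fifteen unrolled variants of one
realignment loop (L(Shl1) through L(Shl15)), plus an aligned path.
PALIGNR encodes its byte-shift count as an immediate, so every
possible source/destination misalignment needs its own copy of the
loop, and that is where most of the file's line count comes from.
The following C-intrinsics sketch shows just the core trick for the
shift-by-4 case; it is not from the patch, the function and variable
names are illustrative, and the null-byte scan (pminub/pcmpeqb) that
guards every store in the real kernel is omitted:

#include <emmintrin.h>   /* SSE2: aligned loads/stores.  */
#include <tmmintrin.h>   /* SSSE3: _mm_alignr_epi8 (PALIGNR).  */

/* Copy 64 bytes whose source is misaligned by 4, the shape of
   L(Shl4LoopStart) above.  SRC must point 4 bytes past a 16-byte
   boundary and DST must be 16-byte aligned.  */
static void
copy64_shl4 (unsigned char *dst, const unsigned char *src)
{
  const unsigned char *base = src - 4;	/* 16-byte aligned.  */
  __m128i prev = _mm_load_si128 ((const __m128i *) base);
  for (int i = 0; i < 4; i++)
    {
      __m128i next
	= _mm_load_si128 ((const __m128i *) (base + 16 * (i + 1)));
      /* (next:prev) >> 4 bytes, the job "palignr $4, %xmm_prev,
	 %xmm_next" does in the deleted loop.  The shift count must
	 be an immediate, hence one loop per misalignment.  */
      __m128i out = _mm_alignr_epi8 (next, prev, 4);
      _mm_store_si128 ((__m128i *) (dst + 16 * i), out);
      prev = next;
    }
}

The implementations that remain rely on unaligned loads (movdqu and
wider) and need no per-shift specialization, which is roughly why
these SSSE3 kernels can be dropped at little cost on current targets.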
* Re: [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
@ 2022-04-14 18:13 ` H.J. Lu
0 siblings, 0 replies; 49+ messages in thread
From: H.J. Lu @ 2022-04-14 18:13 UTC (permalink / raw)
To: Noah Goldstein; +Cc: GNU C Library, Carlos O'Donell
On Thu, Apr 14, 2022 at 9:47 AM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> With SSE2, SSE4.1, AVX2, and EVEX versions available, very few
> targets prefer SSSE3. As a result it is no longer worth keeping the
> SSSE3 versions given their code size cost.
> ---
> sysdeps/x86_64/multiarch/Makefile | 2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 -
> sysdeps/x86_64/multiarch/ifunc-memmove.h | 16 +-
> sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 3181 -----------------
> sysdeps/x86_64/multiarch/memmove-ssse3-back.S | 4 -
> 5 files changed, 6 insertions(+), 3212 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3-back.S
>
> diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
> index 5b02ec8de5..303fb5d734 100644
> --- a/sysdeps/x86_64/multiarch/Makefile
> +++ b/sysdeps/x86_64/multiarch/Makefile
> @@ -17,7 +17,6 @@ sysdep_routines += \
> memcmpeq-evex \
> memcmpeq-sse2 \
> memcpy-ssse3 \
> - memcpy-ssse3-back \
> memmove-avx-unaligned-erms \
> memmove-avx-unaligned-erms-rtm \
> memmove-avx512-no-vzeroupper \
> @@ -25,7 +24,6 @@ sysdep_routines += \
> memmove-evex-unaligned-erms \
> memmove-sse2-unaligned-erms \
> memmove-ssse3 \
> - memmove-ssse3-back \
> memrchr-avx2 \
> memrchr-avx2-rtm \
> memrchr-evex \
> diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> index 49ce6860d0..c6008a73ed 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
> @@ -132,9 +132,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memmove_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memmove_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memmove_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memmove_chk_ssse3)
> @@ -177,8 +174,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memmove,
> CPU_FEATURE_USABLE (AVX512VL),
> __memmove_avx512_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> - __memmove_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
> __memmove_ssse3)
> IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_erms)
> @@ -872,9 +867,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __memcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __memcpy_chk_ssse3)
> @@ -908,8 +900,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, memcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __memcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> - __memcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
> __memcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, memcpy,
> @@ -958,9 +948,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_chk_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> - CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_chk_ssse3_back)
> IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
> CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_chk_ssse3)
> @@ -1003,8 +990,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
> IFUNC_IMPL_ADD (array, i, mempcpy,
> CPU_FEATURE_USABLE (AVX512VL),
> __mempcpy_evex_unaligned_erms)
> - IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> - __mempcpy_ssse3_back)
> IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
> __mempcpy_ssse3)
> IFUNC_IMPL_ADD (array, i, mempcpy, 1,
> diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> index f8f958064c..fb01fbb301 100644
> --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
> +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
> @@ -25,7 +25,6 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned)
> extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
> attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
> -extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
> extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
> attribute_hidden;
> @@ -94,17 +93,14 @@ IFUNC_SELECTOR (void)
> }
> }
>
> - if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> - || CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
> + && !CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Copy))
> {
> - if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> - return OPTIMIZE (sse2_unaligned_erms);
> -
> - return OPTIMIZE (sse2_unaligned);
> + return OPTIMIZE (ssse3);
> }
>
> - if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Copy_Backward))
> - return OPTIMIZE (ssse3_back);
> + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
> + return OPTIMIZE (sse2_unaligned_erms);
>
> - return OPTIMIZE (ssse3);
> + return OPTIMIZE (sse2_unaligned);
> }
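
To make the new control flow easier to follow, here is a minimal C
sketch of the selector tail after this change. Illustrative only: the
enum and flag parameters are simplified stand-ins for glibc's
OPTIMIZE () and CPU_FEATURE_USABLE_P ()/CPU_FEATURES_ARCH_P () macros
shown in the hunk above.

/* Sketch of the memmove IFUNC selector tail after this patch.  */
enum memmove_impl
{
  IMPL_SSE2_UNALIGNED,
  IMPL_SSE2_UNALIGNED_ERMS,
  IMPL_SSSE3
};

static enum memmove_impl
select_memmove_tail (int has_ssse3, int fast_unaligned_copy,
                     int has_erms)
{
  /* With __memmove_ssse3_back gone, SSSE3 is chosen only when the
     CPU does not prefer unaligned SSE2 copies; the old
     Fast_Copy_Backward branch disappears entirely.  */
  if (has_ssse3 && !fast_unaligned_copy)
    return IMPL_SSSE3;

  /* Everything else takes the SSE2 unaligned path, preferring the
     ERMS (rep movsb) variant when available.  */
  if (has_erms)
    return IMPL_SSE2_UNALIGNED_ERMS;

  return IMPL_SSE2_UNALIGNED;
}

The net effect is that SSSE3-capable targets without
Fast_Unaligned_Copy keep a single forward-copy SSSE3 routine, and
every other target falls through to the SSE2 unaligned variants.
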
> diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> deleted file mode 100644
> index 92cfbf7933..0000000000
> --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
> +++ /dev/null
> @@ -1,3181 +0,0 @@
> -/* memcpy with SSSE3 and REP string
> - Copyright (C) 2010-2022 Free Software Foundation, Inc.
> - This file is part of the GNU C Library.
> -
> - The GNU C Library is free software; you can redistribute it and/or
> - modify it under the terms of the GNU Lesser General Public
> - License as published by the Free Software Foundation; either
> - version 2.1 of the License, or (at your option) any later version.
> -
> - The GNU C Library is distributed in the hope that it will be useful,
> - but WITHOUT ANY WARRANTY; without even the implied warranty of
> - MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
> - Lesser General Public License for more details.
> -
> - You should have received a copy of the GNU Lesser General Public
> - License along with the GNU C Library; if not, see
> - <https://www.gnu.org/licenses/>. */
> -
> -#include <sysdep.h>
> -
> -#if IS_IN (libc)
> -
> -#include "asm-syntax.h"
> -
> -#ifndef MEMCPY
> -# define MEMCPY __memcpy_ssse3_back
> -# define MEMCPY_CHK __memcpy_chk_ssse3_back
> -# define MEMPCPY __mempcpy_ssse3_back
> -# define MEMPCPY_CHK __mempcpy_chk_ssse3_back
> -#endif
> -
> -#define JMPTBL(I, B) I - B
> -
> -/* Branch to an entry in a jump table. TABLE is a jump table with
> - relative offsets. INDEX is a register contains the index into the
> - jump table. SCALE is the scale of INDEX. */
> -#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \
> - lea TABLE(%rip), %r11; \
> - movslq (%r11, INDEX, SCALE), INDEX; \
> - lea (%r11, INDEX), INDEX; \
> - _CET_NOTRACK jmp *INDEX; \
> - ud2
> -
> - .section .text.ssse3,"ax",@progbits
> -#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
> -ENTRY (MEMPCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMPCPY_CHK)
> -
> -ENTRY (MEMPCPY)
> - mov %RDI_LP, %RAX_LP
> - add %RDX_LP, %RAX_LP
> - jmp L(start)
> -END (MEMPCPY)
> -#endif
> -
> -#if !defined USE_AS_BCOPY
> -ENTRY (MEMCPY_CHK)
> - cmp %RDX_LP, %RCX_LP
> - jb HIDDEN_JUMPTARGET (__chk_fail)
> -END (MEMCPY_CHK)
> -#endif
> -
> -ENTRY (MEMCPY)
> - mov %RDI_LP, %RAX_LP
> -#ifdef USE_AS_MEMPCPY
> - add %RDX_LP, %RAX_LP
> -#endif
> -
> -#ifdef __ILP32__
> - /* Clear the upper 32 bits. */
> - mov %edx, %edx
> -#endif
> -
> -#ifdef USE_AS_MEMMOVE
> - cmp %rsi, %rdi
> - jb L(copy_forward)
> - je L(bwd_write_0bytes)
> - cmp $144, %rdx
> - jae L(copy_backward)
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -L(copy_forward):
> -#endif
> -L(start):
> - cmp $144, %rdx
> - jae L(144bytesormore)
> -
> -L(fwd_write_less32bytes):
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jbe L(bk_write)
> -#endif
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -#ifndef USE_AS_MEMMOVE
> -L(bk_write):
> -
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -#endif
> -
> - .p2align 4
> -L(144bytesormore):
> -
> -#ifndef USE_AS_MEMMOVE
> - cmp %dil, %sil
> - jle L(copy_backward)
> -#endif
> - movdqu (%rsi), %xmm0
> - mov %rdi, %r8
> - and $-16, %rdi
> - add $16, %rdi
> - mov %rdi, %r9
> - sub %r8, %r9
> - sub %r9, %rdx
> - add %r9, %rsi
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0)
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - cmp %rcx, %rdx
> - jae L(gobble_mem_fwd)
> - lea L(shl_table_fwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(copy_backward):
> -#ifdef DATA_CACHE_SIZE
> - mov $DATA_CACHE_SIZE, %RCX_LP
> -#else
> - mov __x86_data_cache_size(%rip), %RCX_LP
> -#endif
> - shl $1, %rcx
> - cmp %rcx, %rdx
> - ja L(gobble_mem_bwd)
> -
> - add %rdx, %rdi
> - add %rdx, %rsi
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $0xf, %r9
> - xor %r9, %rdi
> - sub %r9, %rsi
> - sub %r9, %rdx
> - mov %rsi, %r9
> - and $0xf, %r9
> - jz L(shl_0_bwd)
> - lea L(shl_table_bwd)(%rip), %r11
> - sub $0x80, %rdx
> - movslq (%r11, %r9, 4), %r9
> - add %r11, %r9
> - _CET_NOTRACK jmp *%r9
> - ud2
> -
> - .p2align 4
> -L(shl_0):
> -
> - mov %rdx, %r9
> - shr $8, %r9
> - add %rdx, %r9
> -#ifdef DATA_CACHE_SIZE
> - cmp $DATA_CACHE_SIZE_HALF, %R9_LP
> -#else
> - cmp __x86_data_cache_size_half(%rip), %R9_LP
> -#endif
> - jae L(gobble_mem_fwd)
> - sub $0x80, %rdx
> - .p2align 4
> -L(shl_0_loop):
> - movdqa (%rsi), %xmm1
> - movdqa %xmm1, (%rdi)
> - movaps 0x10(%rsi), %xmm2
> - movaps %xmm2, 0x10(%rdi)
> - movaps 0x20(%rsi), %xmm3
> - movaps %xmm3, 0x20(%rdi)
> - movaps 0x30(%rsi), %xmm4
> - movaps %xmm4, 0x30(%rdi)
> - movaps 0x40(%rsi), %xmm1
> - movaps %xmm1, 0x40(%rdi)
> - movaps 0x50(%rsi), %xmm2
> - movaps %xmm2, 0x50(%rdi)
> - movaps 0x60(%rsi), %xmm3
> - movaps %xmm3, 0x60(%rdi)
> - movaps 0x70(%rsi), %xmm4
> - movaps %xmm4, 0x70(%rdi)
> - sub $0x80, %rdx
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(shl_0_loop)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_0_bwd):
> - sub $0x80, %rdx
> -L(copy_backward_loop):
> - movaps -0x10(%rsi), %xmm1
> - movaps %xmm1, -0x10(%rdi)
> - movaps -0x20(%rsi), %xmm2
> - movaps %xmm2, -0x20(%rdi)
> - movaps -0x30(%rsi), %xmm3
> - movaps %xmm3, -0x30(%rdi)
> - movaps -0x40(%rsi), %xmm4
> - movaps %xmm4, -0x40(%rdi)
> - movaps -0x50(%rsi), %xmm5
> - movaps %xmm5, -0x50(%rdi)
> - movaps -0x60(%rsi), %xmm5
> - movaps %xmm5, -0x60(%rdi)
> - movaps -0x70(%rsi), %xmm5
> - movaps %xmm5, -0x70(%rdi)
> - movaps -0x80(%rsi), %xmm5
> - movaps %xmm5, -0x80(%rdi)
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(copy_backward_loop)
> -
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1):
> - sub $0x80, %rdx
> - movaps -0x01(%rsi), %xmm1
> - movaps 0x0f(%rsi), %xmm2
> - movaps 0x1f(%rsi), %xmm3
> - movaps 0x2f(%rsi), %xmm4
> - movaps 0x3f(%rsi), %xmm5
> - movaps 0x4f(%rsi), %xmm6
> - movaps 0x5f(%rsi), %xmm7
> - movaps 0x6f(%rsi), %xmm8
> - movaps 0x7f(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $1, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $1, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $1, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $1, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $1, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $1, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $1, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $1, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_1)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_1_bwd):
> - movaps -0x01(%rsi), %xmm1
> -
> - movaps -0x11(%rsi), %xmm2
> - palignr $1, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x21(%rsi), %xmm3
> - palignr $1, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x31(%rsi), %xmm4
> - palignr $1, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x41(%rsi), %xmm5
> - palignr $1, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x51(%rsi), %xmm6
> - palignr $1, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x61(%rsi), %xmm7
> - palignr $1, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x71(%rsi), %xmm8
> - palignr $1, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x81(%rsi), %xmm9
> - palignr $1, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_1_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2):
> - sub $0x80, %rdx
> - movaps -0x02(%rsi), %xmm1
> - movaps 0x0e(%rsi), %xmm2
> - movaps 0x1e(%rsi), %xmm3
> - movaps 0x2e(%rsi), %xmm4
> - movaps 0x3e(%rsi), %xmm5
> - movaps 0x4e(%rsi), %xmm6
> - movaps 0x5e(%rsi), %xmm7
> - movaps 0x6e(%rsi), %xmm8
> - movaps 0x7e(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $2, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $2, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $2, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $2, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $2, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $2, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $2, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $2, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_2)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_2_bwd):
> - movaps -0x02(%rsi), %xmm1
> -
> - movaps -0x12(%rsi), %xmm2
> - palignr $2, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x22(%rsi), %xmm3
> - palignr $2, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x32(%rsi), %xmm4
> - palignr $2, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x42(%rsi), %xmm5
> - palignr $2, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x52(%rsi), %xmm6
> - palignr $2, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x62(%rsi), %xmm7
> - palignr $2, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x72(%rsi), %xmm8
> - palignr $2, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x82(%rsi), %xmm9
> - palignr $2, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_2_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3):
> - sub $0x80, %rdx
> - movaps -0x03(%rsi), %xmm1
> - movaps 0x0d(%rsi), %xmm2
> - movaps 0x1d(%rsi), %xmm3
> - movaps 0x2d(%rsi), %xmm4
> - movaps 0x3d(%rsi), %xmm5
> - movaps 0x4d(%rsi), %xmm6
> - movaps 0x5d(%rsi), %xmm7
> - movaps 0x6d(%rsi), %xmm8
> - movaps 0x7d(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $3, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $3, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $3, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $3, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $3, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $3, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $3, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $3, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_3)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_3_bwd):
> - movaps -0x03(%rsi), %xmm1
> -
> - movaps -0x13(%rsi), %xmm2
> - palignr $3, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x23(%rsi), %xmm3
> - palignr $3, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x33(%rsi), %xmm4
> - palignr $3, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x43(%rsi), %xmm5
> - palignr $3, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x53(%rsi), %xmm6
> - palignr $3, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x63(%rsi), %xmm7
> - palignr $3, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x73(%rsi), %xmm8
> - palignr $3, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x83(%rsi), %xmm9
> - palignr $3, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_3_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4):
> - sub $0x80, %rdx
> - movaps -0x04(%rsi), %xmm1
> - movaps 0x0c(%rsi), %xmm2
> - movaps 0x1c(%rsi), %xmm3
> - movaps 0x2c(%rsi), %xmm4
> - movaps 0x3c(%rsi), %xmm5
> - movaps 0x4c(%rsi), %xmm6
> - movaps 0x5c(%rsi), %xmm7
> - movaps 0x6c(%rsi), %xmm8
> - movaps 0x7c(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $4, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $4, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $4, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $4, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $4, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $4, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $4, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $4, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_4)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_4_bwd):
> - movaps -0x04(%rsi), %xmm1
> -
> - movaps -0x14(%rsi), %xmm2
> - palignr $4, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x24(%rsi), %xmm3
> - palignr $4, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x34(%rsi), %xmm4
> - palignr $4, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x44(%rsi), %xmm5
> - palignr $4, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x54(%rsi), %xmm6
> - palignr $4, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x64(%rsi), %xmm7
> - palignr $4, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x74(%rsi), %xmm8
> - palignr $4, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x84(%rsi), %xmm9
> - palignr $4, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_4_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5):
> - sub $0x80, %rdx
> - movaps -0x05(%rsi), %xmm1
> - movaps 0x0b(%rsi), %xmm2
> - movaps 0x1b(%rsi), %xmm3
> - movaps 0x2b(%rsi), %xmm4
> - movaps 0x3b(%rsi), %xmm5
> - movaps 0x4b(%rsi), %xmm6
> - movaps 0x5b(%rsi), %xmm7
> - movaps 0x6b(%rsi), %xmm8
> - movaps 0x7b(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $5, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $5, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $5, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $5, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $5, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $5, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $5, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $5, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_5)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_5_bwd):
> - movaps -0x05(%rsi), %xmm1
> -
> - movaps -0x15(%rsi), %xmm2
> - palignr $5, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x25(%rsi), %xmm3
> - palignr $5, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x35(%rsi), %xmm4
> - palignr $5, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x45(%rsi), %xmm5
> - palignr $5, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x55(%rsi), %xmm6
> - palignr $5, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x65(%rsi), %xmm7
> - palignr $5, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x75(%rsi), %xmm8
> - palignr $5, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x85(%rsi), %xmm9
> - palignr $5, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_5_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6):
> - sub $0x80, %rdx
> - movaps -0x06(%rsi), %xmm1
> - movaps 0x0a(%rsi), %xmm2
> - movaps 0x1a(%rsi), %xmm3
> - movaps 0x2a(%rsi), %xmm4
> - movaps 0x3a(%rsi), %xmm5
> - movaps 0x4a(%rsi), %xmm6
> - movaps 0x5a(%rsi), %xmm7
> - movaps 0x6a(%rsi), %xmm8
> - movaps 0x7a(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $6, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $6, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $6, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $6, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $6, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $6, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $6, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $6, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_6)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_6_bwd):
> - movaps -0x06(%rsi), %xmm1
> -
> - movaps -0x16(%rsi), %xmm2
> - palignr $6, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x26(%rsi), %xmm3
> - palignr $6, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x36(%rsi), %xmm4
> - palignr $6, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x46(%rsi), %xmm5
> - palignr $6, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x56(%rsi), %xmm6
> - palignr $6, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x66(%rsi), %xmm7
> - palignr $6, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x76(%rsi), %xmm8
> - palignr $6, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x86(%rsi), %xmm9
> - palignr $6, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_6_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7):
> - sub $0x80, %rdx
> - movaps -0x07(%rsi), %xmm1
> - movaps 0x09(%rsi), %xmm2
> - movaps 0x19(%rsi), %xmm3
> - movaps 0x29(%rsi), %xmm4
> - movaps 0x39(%rsi), %xmm5
> - movaps 0x49(%rsi), %xmm6
> - movaps 0x59(%rsi), %xmm7
> - movaps 0x69(%rsi), %xmm8
> - movaps 0x79(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $7, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $7, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $7, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $7, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $7, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $7, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $7, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $7, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_7)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_7_bwd):
> - movaps -0x07(%rsi), %xmm1
> -
> - movaps -0x17(%rsi), %xmm2
> - palignr $7, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x27(%rsi), %xmm3
> - palignr $7, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x37(%rsi), %xmm4
> - palignr $7, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x47(%rsi), %xmm5
> - palignr $7, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x57(%rsi), %xmm6
> - palignr $7, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x67(%rsi), %xmm7
> - palignr $7, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x77(%rsi), %xmm8
> - palignr $7, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x87(%rsi), %xmm9
> - palignr $7, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_7_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8):
> - sub $0x80, %rdx
> - movaps -0x08(%rsi), %xmm1
> - movaps 0x08(%rsi), %xmm2
> - movaps 0x18(%rsi), %xmm3
> - movaps 0x28(%rsi), %xmm4
> - movaps 0x38(%rsi), %xmm5
> - movaps 0x48(%rsi), %xmm6
> - movaps 0x58(%rsi), %xmm7
> - movaps 0x68(%rsi), %xmm8
> - movaps 0x78(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $8, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $8, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $8, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $8, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $8, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $8, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $8, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $8, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_8)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_8_bwd):
> - movaps -0x08(%rsi), %xmm1
> -
> - movaps -0x18(%rsi), %xmm2
> - palignr $8, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x28(%rsi), %xmm3
> - palignr $8, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x38(%rsi), %xmm4
> - palignr $8, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x48(%rsi), %xmm5
> - palignr $8, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x58(%rsi), %xmm6
> - palignr $8, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x68(%rsi), %xmm7
> - palignr $8, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x78(%rsi), %xmm8
> - palignr $8, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x88(%rsi), %xmm9
> - palignr $8, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_8_bwd)
> -L(shl_8_end_bwd):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9):
> - sub $0x80, %rdx
> - movaps -0x09(%rsi), %xmm1
> - movaps 0x07(%rsi), %xmm2
> - movaps 0x17(%rsi), %xmm3
> - movaps 0x27(%rsi), %xmm4
> - movaps 0x37(%rsi), %xmm5
> - movaps 0x47(%rsi), %xmm6
> - movaps 0x57(%rsi), %xmm7
> - movaps 0x67(%rsi), %xmm8
> - movaps 0x77(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $9, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $9, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $9, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $9, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $9, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $9, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $9, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $9, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_9)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_9_bwd):
> - movaps -0x09(%rsi), %xmm1
> -
> - movaps -0x19(%rsi), %xmm2
> - palignr $9, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x29(%rsi), %xmm3
> - palignr $9, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x39(%rsi), %xmm4
> - palignr $9, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x49(%rsi), %xmm5
> - palignr $9, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x59(%rsi), %xmm6
> - palignr $9, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x69(%rsi), %xmm7
> - palignr $9, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x79(%rsi), %xmm8
> - palignr $9, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x89(%rsi), %xmm9
> - palignr $9, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_9_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10):
> - sub $0x80, %rdx
> - movaps -0x0a(%rsi), %xmm1
> - movaps 0x06(%rsi), %xmm2
> - movaps 0x16(%rsi), %xmm3
> - movaps 0x26(%rsi), %xmm4
> - movaps 0x36(%rsi), %xmm5
> - movaps 0x46(%rsi), %xmm6
> - movaps 0x56(%rsi), %xmm7
> - movaps 0x66(%rsi), %xmm8
> - movaps 0x76(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $10, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $10, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $10, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $10, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $10, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $10, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $10, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $10, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_10)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_10_bwd):
> - movaps -0x0a(%rsi), %xmm1
> -
> - movaps -0x1a(%rsi), %xmm2
> - palignr $10, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2a(%rsi), %xmm3
> - palignr $10, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3a(%rsi), %xmm4
> - palignr $10, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4a(%rsi), %xmm5
> - palignr $10, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5a(%rsi), %xmm6
> - palignr $10, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6a(%rsi), %xmm7
> - palignr $10, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7a(%rsi), %xmm8
> - palignr $10, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8a(%rsi), %xmm9
> - palignr $10, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_10_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11):
> - sub $0x80, %rdx
> - movaps -0x0b(%rsi), %xmm1
> - movaps 0x05(%rsi), %xmm2
> - movaps 0x15(%rsi), %xmm3
> - movaps 0x25(%rsi), %xmm4
> - movaps 0x35(%rsi), %xmm5
> - movaps 0x45(%rsi), %xmm6
> - movaps 0x55(%rsi), %xmm7
> - movaps 0x65(%rsi), %xmm8
> - movaps 0x75(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $11, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $11, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $11, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $11, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $11, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $11, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $11, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $11, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_11)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_11_bwd):
> - movaps -0x0b(%rsi), %xmm1
> -
> - movaps -0x1b(%rsi), %xmm2
> - palignr $11, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2b(%rsi), %xmm3
> - palignr $11, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3b(%rsi), %xmm4
> - palignr $11, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4b(%rsi), %xmm5
> - palignr $11, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5b(%rsi), %xmm6
> - palignr $11, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6b(%rsi), %xmm7
> - palignr $11, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7b(%rsi), %xmm8
> - palignr $11, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8b(%rsi), %xmm9
> - palignr $11, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_11_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12):
> - sub $0x80, %rdx
> - movdqa -0x0c(%rsi), %xmm1
> - movaps 0x04(%rsi), %xmm2
> - movaps 0x14(%rsi), %xmm3
> - movaps 0x24(%rsi), %xmm4
> - movaps 0x34(%rsi), %xmm5
> - movaps 0x44(%rsi), %xmm6
> - movaps 0x54(%rsi), %xmm7
> - movaps 0x64(%rsi), %xmm8
> - movaps 0x74(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $12, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $12, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $12, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $12, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $12, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $12, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $12, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $12, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> -
> - lea 0x80(%rdi), %rdi
> - jae L(shl_12)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_12_bwd):
> - movaps -0x0c(%rsi), %xmm1
> -
> - movaps -0x1c(%rsi), %xmm2
> - palignr $12, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2c(%rsi), %xmm3
> - palignr $12, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3c(%rsi), %xmm4
> - palignr $12, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4c(%rsi), %xmm5
> - palignr $12, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5c(%rsi), %xmm6
> - palignr $12, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6c(%rsi), %xmm7
> - palignr $12, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7c(%rsi), %xmm8
> - palignr $12, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8c(%rsi), %xmm9
> - palignr $12, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_12_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13):
> - sub $0x80, %rdx
> - movaps -0x0d(%rsi), %xmm1
> - movaps 0x03(%rsi), %xmm2
> - movaps 0x13(%rsi), %xmm3
> - movaps 0x23(%rsi), %xmm4
> - movaps 0x33(%rsi), %xmm5
> - movaps 0x43(%rsi), %xmm6
> - movaps 0x53(%rsi), %xmm7
> - movaps 0x63(%rsi), %xmm8
> - movaps 0x73(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $13, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $13, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $13, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $13, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $13, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $13, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $13, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $13, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_13)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_13_bwd):
> - movaps -0x0d(%rsi), %xmm1
> -
> - movaps -0x1d(%rsi), %xmm2
> - palignr $13, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2d(%rsi), %xmm3
> - palignr $13, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3d(%rsi), %xmm4
> - palignr $13, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4d(%rsi), %xmm5
> - palignr $13, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5d(%rsi), %xmm6
> - palignr $13, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6d(%rsi), %xmm7
> - palignr $13, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7d(%rsi), %xmm8
> - palignr $13, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8d(%rsi), %xmm9
> - palignr $13, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_13_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14):
> - sub $0x80, %rdx
> - movaps -0x0e(%rsi), %xmm1
> - movaps 0x02(%rsi), %xmm2
> - movaps 0x12(%rsi), %xmm3
> - movaps 0x22(%rsi), %xmm4
> - movaps 0x32(%rsi), %xmm5
> - movaps 0x42(%rsi), %xmm6
> - movaps 0x52(%rsi), %xmm7
> - movaps 0x62(%rsi), %xmm8
> - movaps 0x72(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $14, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $14, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $14, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $14, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $14, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $14, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $14, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $14, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_14)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_14_bwd):
> - movaps -0x0e(%rsi), %xmm1
> -
> - movaps -0x1e(%rsi), %xmm2
> - palignr $14, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2e(%rsi), %xmm3
> - palignr $14, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3e(%rsi), %xmm4
> - palignr $14, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4e(%rsi), %xmm5
> - palignr $14, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5e(%rsi), %xmm6
> - palignr $14, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6e(%rsi), %xmm7
> - palignr $14, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7e(%rsi), %xmm8
> - palignr $14, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8e(%rsi), %xmm9
> - palignr $14, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_14_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15):
> - sub $0x80, %rdx
> - movaps -0x0f(%rsi), %xmm1
> - movaps 0x01(%rsi), %xmm2
> - movaps 0x11(%rsi), %xmm3
> - movaps 0x21(%rsi), %xmm4
> - movaps 0x31(%rsi), %xmm5
> - movaps 0x41(%rsi), %xmm6
> - movaps 0x51(%rsi), %xmm7
> - movaps 0x61(%rsi), %xmm8
> - movaps 0x71(%rsi), %xmm9
> - lea 0x80(%rsi), %rsi
> - palignr $15, %xmm8, %xmm9
> - movaps %xmm9, 0x70(%rdi)
> - palignr $15, %xmm7, %xmm8
> - movaps %xmm8, 0x60(%rdi)
> - palignr $15, %xmm6, %xmm7
> - movaps %xmm7, 0x50(%rdi)
> - palignr $15, %xmm5, %xmm6
> - movaps %xmm6, 0x40(%rdi)
> - palignr $15, %xmm4, %xmm5
> - movaps %xmm5, 0x30(%rdi)
> - palignr $15, %xmm3, %xmm4
> - movaps %xmm4, 0x20(%rdi)
> - palignr $15, %xmm2, %xmm3
> - movaps %xmm3, 0x10(%rdi)
> - palignr $15, %xmm1, %xmm2
> - movaps %xmm2, (%rdi)
> - lea 0x80(%rdi), %rdi
> - jae L(shl_15)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - add %rdx, %rdi
> - add %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(shl_15_bwd):
> - movaps -0x0f(%rsi), %xmm1
> -
> - movaps -0x1f(%rsi), %xmm2
> - palignr $15, %xmm2, %xmm1
> - movaps %xmm1, -0x10(%rdi)
> -
> - movaps -0x2f(%rsi), %xmm3
> - palignr $15, %xmm3, %xmm2
> - movaps %xmm2, -0x20(%rdi)
> -
> - movaps -0x3f(%rsi), %xmm4
> - palignr $15, %xmm4, %xmm3
> - movaps %xmm3, -0x30(%rdi)
> -
> - movaps -0x4f(%rsi), %xmm5
> - palignr $15, %xmm5, %xmm4
> - movaps %xmm4, -0x40(%rdi)
> -
> - movaps -0x5f(%rsi), %xmm6
> - palignr $15, %xmm6, %xmm5
> - movaps %xmm5, -0x50(%rdi)
> -
> - movaps -0x6f(%rsi), %xmm7
> - palignr $15, %xmm7, %xmm6
> - movaps %xmm6, -0x60(%rdi)
> -
> - movaps -0x7f(%rsi), %xmm8
> - palignr $15, %xmm8, %xmm7
> - movaps %xmm7, -0x70(%rdi)
> -
> - movaps -0x8f(%rsi), %xmm9
> - palignr $15, %xmm9, %xmm8
> - movaps %xmm8, -0x80(%rdi)
> -
> - sub $0x80, %rdx
> - lea -0x80(%rdi), %rdi
> - lea -0x80(%rsi), %rsi
> - jae L(shl_15_bwd)
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rdi
> - sub %rdx, %rsi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_fwd):
> - movdqu (%rsi), %xmm1
> - movdqu %xmm0, (%r8)
> - movdqa %xmm1, (%rdi)
> - sub $16, %rdx
> - add $16, %rsi
> - add $16, %rdi
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rsi, %r9
> - sub %rdi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_fwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_fwd_start)
> -L(memmove_is_memcpy_fwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger_in_fwd)
> - mov %rdx, %rcx
> -L(bigger_in_fwd):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy_fwd)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy_fwd)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy_fwd):
> - sub $0x80, %rdx
> -L(gobble_mem_fwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 0x200(%rsi)
> - prefetcht0 0x300(%rsi)
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - lfence
> - movntdq %xmm0, (%rdi)
> - movntdq %xmm1, 0x10(%rdi)
> - movntdq %xmm2, 0x20(%rdi)
> - movntdq %xmm3, 0x30(%rdi)
> - movntdq %xmm4, 0x40(%rdi)
> - movntdq %xmm5, 0x50(%rdi)
> - movntdq %xmm6, 0x60(%rdi)
> - movntdq %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_mem_fwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_fwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy_fwd):
> - add %rcx, %rdx
> -L(ll_cache_copy_fwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop_fwd):
> - prefetchnta 0x1c0(%rsi)
> - prefetchnta 0x280(%rsi)
> - prefetchnta 0x1c0(%rdi)
> - prefetchnta 0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu (%rsi), %xmm0
> - movdqu 0x10(%rsi), %xmm1
> - movdqu 0x20(%rsi), %xmm2
> - movdqu 0x30(%rsi), %xmm3
> - movdqu 0x40(%rsi), %xmm4
> - movdqu 0x50(%rsi), %xmm5
> - movdqu 0x60(%rsi), %xmm6
> - movdqu 0x70(%rsi), %xmm7
> - movdqa %xmm0, (%rdi)
> - movdqa %xmm1, 0x10(%rdi)
> - movdqa %xmm2, 0x20(%rdi)
> - movdqa %xmm3, 0x30(%rdi)
> - movdqa %xmm4, 0x40(%rdi)
> - movdqa %xmm5, 0x50(%rdi)
> - movdqa %xmm6, 0x60(%rdi)
> - movdqa %xmm7, 0x70(%rdi)
> - lea 0x80(%rsi), %rsi
> - lea 0x80(%rdi), %rdi
> - jae L(gobble_ll_loop_fwd)
> -L(gobble_mem_fwd_end):
> - add $0x80, %rdx
> - add %rdx, %rsi
> - add %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_fwd), %rdx, 4)
> -
> - .p2align 4
> -L(gobble_mem_bwd):
> - add %rdx, %rsi
> - add %rdx, %rdi
> -
> - movdqu -16(%rsi), %xmm0
> - lea -16(%rdi), %r8
> - mov %rdi, %r9
> - and $-16, %rdi
> - sub %rdi, %r9
> - sub %r9, %rsi
> - sub %r9, %rdx
> -
> -
> -#ifdef SHARED_CACHE_SIZE_HALF
> - mov $SHARED_CACHE_SIZE_HALF, %RCX_LP
> -#else
> - mov __x86_shared_cache_size_half(%rip), %RCX_LP
> -#endif
> -#ifdef USE_AS_MEMMOVE
> - mov %rdi, %r9
> - sub %rsi, %r9
> - cmp %rdx, %r9
> - jae L(memmove_is_memcpy_bwd)
> - cmp %rcx, %r9
> - jbe L(ll_cache_copy_bwd_start)
> -L(memmove_is_memcpy_bwd):
> -#endif
> - cmp %rcx, %rdx
> - ja L(bigger)
> - mov %rdx, %rcx
> -L(bigger):
> - sub %rcx, %rdx
> - cmp $0x1000, %rdx
> - jbe L(ll_cache_copy)
> -
> - mov %rcx, %r9
> - shl $3, %r9
> - cmp %r9, %rdx
> - jbe L(2steps_copy)
> - add %rcx, %rdx
> - xor %rcx, %rcx
> -L(2steps_copy):
> - sub $0x80, %rdx
> -L(gobble_mem_bwd_loop):
> - sub $0x80, %rdx
> - prefetcht0 -0x200(%rsi)
> - prefetcht0 -0x300(%rsi)
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - lfence
> - movntdq %xmm1, -0x10(%rdi)
> - movntdq %xmm2, -0x20(%rdi)
> - movntdq %xmm3, -0x30(%rdi)
> - movntdq %xmm4, -0x40(%rdi)
> - movntdq %xmm5, -0x50(%rdi)
> - movntdq %xmm6, -0x60(%rdi)
> - movntdq %xmm7, -0x70(%rdi)
> - movntdq %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_mem_bwd_loop)
> - sfence
> - cmp $0x80, %rcx
> - jb L(gobble_mem_bwd_end)
> - add $0x80, %rdx
> -L(ll_cache_copy):
> - add %rcx, %rdx
> -L(ll_cache_copy_bwd_start):
> - sub $0x80, %rdx
> -L(gobble_ll_loop):
> - prefetchnta -0x1c0(%rsi)
> - prefetchnta -0x280(%rsi)
> - prefetchnta -0x1c0(%rdi)
> - prefetchnta -0x280(%rdi)
> - sub $0x80, %rdx
> - movdqu -0x10(%rsi), %xmm1
> - movdqu -0x20(%rsi), %xmm2
> - movdqu -0x30(%rsi), %xmm3
> - movdqu -0x40(%rsi), %xmm4
> - movdqu -0x50(%rsi), %xmm5
> - movdqu -0x60(%rsi), %xmm6
> - movdqu -0x70(%rsi), %xmm7
> - movdqu -0x80(%rsi), %xmm8
> - movdqa %xmm1, -0x10(%rdi)
> - movdqa %xmm2, -0x20(%rdi)
> - movdqa %xmm3, -0x30(%rdi)
> - movdqa %xmm4, -0x40(%rdi)
> - movdqa %xmm5, -0x50(%rdi)
> - movdqa %xmm6, -0x60(%rdi)
> - movdqa %xmm7, -0x70(%rdi)
> - movdqa %xmm8, -0x80(%rdi)
> - lea -0x80(%rsi), %rsi
> - lea -0x80(%rdi), %rdi
> - jae L(gobble_ll_loop)
> -L(gobble_mem_bwd_end):
> - movdqu %xmm0, (%r8)
> - add $0x80, %rdx
> - sub %rdx, %rsi
> - sub %rdx, %rdi
> - BRANCH_TO_JMPTBL_ENTRY (L(table_144_bytes_bwd), %rdx, 4)
> -
> - .p2align 4
> -L(fwd_write_128bytes):
> - lddqu -128(%rsi), %xmm0
> - movdqu %xmm0, -128(%rdi)
> -L(fwd_write_112bytes):
> - lddqu -112(%rsi), %xmm0
> - movdqu %xmm0, -112(%rdi)
> -L(fwd_write_96bytes):
> - lddqu -96(%rsi), %xmm0
> - movdqu %xmm0, -96(%rdi)
> -L(fwd_write_80bytes):
> - lddqu -80(%rsi), %xmm0
> - movdqu %xmm0, -80(%rdi)
> -L(fwd_write_64bytes):
> - lddqu -64(%rsi), %xmm0
> - movdqu %xmm0, -64(%rdi)
> -L(fwd_write_48bytes):
> - lddqu -48(%rsi), %xmm0
> - movdqu %xmm0, -48(%rdi)
> -L(fwd_write_32bytes):
> - lddqu -32(%rsi), %xmm0
> - movdqu %xmm0, -32(%rdi)
> -L(fwd_write_16bytes):
> - lddqu -16(%rsi), %xmm0
> - movdqu %xmm0, -16(%rdi)
> -L(fwd_write_0bytes):
> - ret
> -
> -
> - .p2align 4
> -L(fwd_write_143bytes):
> - lddqu -143(%rsi), %xmm0
> - movdqu %xmm0, -143(%rdi)
> -L(fwd_write_127bytes):
> - lddqu -127(%rsi), %xmm0
> - movdqu %xmm0, -127(%rdi)
> -L(fwd_write_111bytes):
> - lddqu -111(%rsi), %xmm0
> - movdqu %xmm0, -111(%rdi)
> -L(fwd_write_95bytes):
> - lddqu -95(%rsi), %xmm0
> - movdqu %xmm0, -95(%rdi)
> -L(fwd_write_79bytes):
> - lddqu -79(%rsi), %xmm0
> - movdqu %xmm0, -79(%rdi)
> -L(fwd_write_63bytes):
> - lddqu -63(%rsi), %xmm0
> - movdqu %xmm0, -63(%rdi)
> -L(fwd_write_47bytes):
> - lddqu -47(%rsi), %xmm0
> - movdqu %xmm0, -47(%rdi)
> -L(fwd_write_31bytes):
> - lddqu -31(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -31(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_15bytes):
> - mov -15(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -15(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_142bytes):
> - lddqu -142(%rsi), %xmm0
> - movdqu %xmm0, -142(%rdi)
> -L(fwd_write_126bytes):
> - lddqu -126(%rsi), %xmm0
> - movdqu %xmm0, -126(%rdi)
> -L(fwd_write_110bytes):
> - lddqu -110(%rsi), %xmm0
> - movdqu %xmm0, -110(%rdi)
> -L(fwd_write_94bytes):
> - lddqu -94(%rsi), %xmm0
> - movdqu %xmm0, -94(%rdi)
> -L(fwd_write_78bytes):
> - lddqu -78(%rsi), %xmm0
> - movdqu %xmm0, -78(%rdi)
> -L(fwd_write_62bytes):
> - lddqu -62(%rsi), %xmm0
> - movdqu %xmm0, -62(%rdi)
> -L(fwd_write_46bytes):
> - lddqu -46(%rsi), %xmm0
> - movdqu %xmm0, -46(%rdi)
> -L(fwd_write_30bytes):
> - lddqu -30(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -30(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_14bytes):
> - mov -14(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -14(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_141bytes):
> - lddqu -141(%rsi), %xmm0
> - movdqu %xmm0, -141(%rdi)
> -L(fwd_write_125bytes):
> - lddqu -125(%rsi), %xmm0
> - movdqu %xmm0, -125(%rdi)
> -L(fwd_write_109bytes):
> - lddqu -109(%rsi), %xmm0
> - movdqu %xmm0, -109(%rdi)
> -L(fwd_write_93bytes):
> - lddqu -93(%rsi), %xmm0
> - movdqu %xmm0, -93(%rdi)
> -L(fwd_write_77bytes):
> - lddqu -77(%rsi), %xmm0
> - movdqu %xmm0, -77(%rdi)
> -L(fwd_write_61bytes):
> - lddqu -61(%rsi), %xmm0
> - movdqu %xmm0, -61(%rdi)
> -L(fwd_write_45bytes):
> - lddqu -45(%rsi), %xmm0
> - movdqu %xmm0, -45(%rdi)
> -L(fwd_write_29bytes):
> - lddqu -29(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -29(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_13bytes):
> - mov -13(%rsi), %rdx
> - mov -8(%rsi), %rcx
> - mov %rdx, -13(%rdi)
> - mov %rcx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_140bytes):
> - lddqu -140(%rsi), %xmm0
> - movdqu %xmm0, -140(%rdi)
> -L(fwd_write_124bytes):
> - lddqu -124(%rsi), %xmm0
> - movdqu %xmm0, -124(%rdi)
> -L(fwd_write_108bytes):
> - lddqu -108(%rsi), %xmm0
> - movdqu %xmm0, -108(%rdi)
> -L(fwd_write_92bytes):
> - lddqu -92(%rsi), %xmm0
> - movdqu %xmm0, -92(%rdi)
> -L(fwd_write_76bytes):
> - lddqu -76(%rsi), %xmm0
> - movdqu %xmm0, -76(%rdi)
> -L(fwd_write_60bytes):
> - lddqu -60(%rsi), %xmm0
> - movdqu %xmm0, -60(%rdi)
> -L(fwd_write_44bytes):
> - lddqu -44(%rsi), %xmm0
> - movdqu %xmm0, -44(%rdi)
> -L(fwd_write_28bytes):
> - lddqu -28(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -28(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_12bytes):
> - mov -12(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -12(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_139bytes):
> - lddqu -139(%rsi), %xmm0
> - movdqu %xmm0, -139(%rdi)
> -L(fwd_write_123bytes):
> - lddqu -123(%rsi), %xmm0
> - movdqu %xmm0, -123(%rdi)
> -L(fwd_write_107bytes):
> - lddqu -107(%rsi), %xmm0
> - movdqu %xmm0, -107(%rdi)
> -L(fwd_write_91bytes):
> - lddqu -91(%rsi), %xmm0
> - movdqu %xmm0, -91(%rdi)
> -L(fwd_write_75bytes):
> - lddqu -75(%rsi), %xmm0
> - movdqu %xmm0, -75(%rdi)
> -L(fwd_write_59bytes):
> - lddqu -59(%rsi), %xmm0
> - movdqu %xmm0, -59(%rdi)
> -L(fwd_write_43bytes):
> - lddqu -43(%rsi), %xmm0
> - movdqu %xmm0, -43(%rdi)
> -L(fwd_write_27bytes):
> - lddqu -27(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -27(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_11bytes):
> - mov -11(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -11(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_138bytes):
> - lddqu -138(%rsi), %xmm0
> - movdqu %xmm0, -138(%rdi)
> -L(fwd_write_122bytes):
> - lddqu -122(%rsi), %xmm0
> - movdqu %xmm0, -122(%rdi)
> -L(fwd_write_106bytes):
> - lddqu -106(%rsi), %xmm0
> - movdqu %xmm0, -106(%rdi)
> -L(fwd_write_90bytes):
> - lddqu -90(%rsi), %xmm0
> - movdqu %xmm0, -90(%rdi)
> -L(fwd_write_74bytes):
> - lddqu -74(%rsi), %xmm0
> - movdqu %xmm0, -74(%rdi)
> -L(fwd_write_58bytes):
> - lddqu -58(%rsi), %xmm0
> - movdqu %xmm0, -58(%rdi)
> -L(fwd_write_42bytes):
> - lddqu -42(%rsi), %xmm0
> - movdqu %xmm0, -42(%rdi)
> -L(fwd_write_26bytes):
> - lddqu -26(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -26(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_10bytes):
> - mov -10(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -10(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_137bytes):
> - lddqu -137(%rsi), %xmm0
> - movdqu %xmm0, -137(%rdi)
> -L(fwd_write_121bytes):
> - lddqu -121(%rsi), %xmm0
> - movdqu %xmm0, -121(%rdi)
> -L(fwd_write_105bytes):
> - lddqu -105(%rsi), %xmm0
> - movdqu %xmm0, -105(%rdi)
> -L(fwd_write_89bytes):
> - lddqu -89(%rsi), %xmm0
> - movdqu %xmm0, -89(%rdi)
> -L(fwd_write_73bytes):
> - lddqu -73(%rsi), %xmm0
> - movdqu %xmm0, -73(%rdi)
> -L(fwd_write_57bytes):
> - lddqu -57(%rsi), %xmm0
> - movdqu %xmm0, -57(%rdi)
> -L(fwd_write_41bytes):
> - lddqu -41(%rsi), %xmm0
> - movdqu %xmm0, -41(%rdi)
> -L(fwd_write_25bytes):
> - lddqu -25(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -25(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_9bytes):
> - mov -9(%rsi), %rdx
> - mov -4(%rsi), %ecx
> - mov %rdx, -9(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_136bytes):
> - lddqu -136(%rsi), %xmm0
> - movdqu %xmm0, -136(%rdi)
> -L(fwd_write_120bytes):
> - lddqu -120(%rsi), %xmm0
> - movdqu %xmm0, -120(%rdi)
> -L(fwd_write_104bytes):
> - lddqu -104(%rsi), %xmm0
> - movdqu %xmm0, -104(%rdi)
> -L(fwd_write_88bytes):
> - lddqu -88(%rsi), %xmm0
> - movdqu %xmm0, -88(%rdi)
> -L(fwd_write_72bytes):
> - lddqu -72(%rsi), %xmm0
> - movdqu %xmm0, -72(%rdi)
> -L(fwd_write_56bytes):
> - lddqu -56(%rsi), %xmm0
> - movdqu %xmm0, -56(%rdi)
> -L(fwd_write_40bytes):
> - lddqu -40(%rsi), %xmm0
> - movdqu %xmm0, -40(%rdi)
> -L(fwd_write_24bytes):
> - lddqu -24(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -24(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_8bytes):
> - mov -8(%rsi), %rdx
> - mov %rdx, -8(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_135bytes):
> - lddqu -135(%rsi), %xmm0
> - movdqu %xmm0, -135(%rdi)
> -L(fwd_write_119bytes):
> - lddqu -119(%rsi), %xmm0
> - movdqu %xmm0, -119(%rdi)
> -L(fwd_write_103bytes):
> - lddqu -103(%rsi), %xmm0
> - movdqu %xmm0, -103(%rdi)
> -L(fwd_write_87bytes):
> - lddqu -87(%rsi), %xmm0
> - movdqu %xmm0, -87(%rdi)
> -L(fwd_write_71bytes):
> - lddqu -71(%rsi), %xmm0
> - movdqu %xmm0, -71(%rdi)
> -L(fwd_write_55bytes):
> - lddqu -55(%rsi), %xmm0
> - movdqu %xmm0, -55(%rdi)
> -L(fwd_write_39bytes):
> - lddqu -39(%rsi), %xmm0
> - movdqu %xmm0, -39(%rdi)
> -L(fwd_write_23bytes):
> - lddqu -23(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -23(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_7bytes):
> - mov -7(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -7(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_134bytes):
> - lddqu -134(%rsi), %xmm0
> - movdqu %xmm0, -134(%rdi)
> -L(fwd_write_118bytes):
> - lddqu -118(%rsi), %xmm0
> - movdqu %xmm0, -118(%rdi)
> -L(fwd_write_102bytes):
> - lddqu -102(%rsi), %xmm0
> - movdqu %xmm0, -102(%rdi)
> -L(fwd_write_86bytes):
> - lddqu -86(%rsi), %xmm0
> - movdqu %xmm0, -86(%rdi)
> -L(fwd_write_70bytes):
> - lddqu -70(%rsi), %xmm0
> - movdqu %xmm0, -70(%rdi)
> -L(fwd_write_54bytes):
> - lddqu -54(%rsi), %xmm0
> - movdqu %xmm0, -54(%rdi)
> -L(fwd_write_38bytes):
> - lddqu -38(%rsi), %xmm0
> - movdqu %xmm0, -38(%rdi)
> -L(fwd_write_22bytes):
> - lddqu -22(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -22(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_6bytes):
> - mov -6(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -6(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_133bytes):
> - lddqu -133(%rsi), %xmm0
> - movdqu %xmm0, -133(%rdi)
> -L(fwd_write_117bytes):
> - lddqu -117(%rsi), %xmm0
> - movdqu %xmm0, -117(%rdi)
> -L(fwd_write_101bytes):
> - lddqu -101(%rsi), %xmm0
> - movdqu %xmm0, -101(%rdi)
> -L(fwd_write_85bytes):
> - lddqu -85(%rsi), %xmm0
> - movdqu %xmm0, -85(%rdi)
> -L(fwd_write_69bytes):
> - lddqu -69(%rsi), %xmm0
> - movdqu %xmm0, -69(%rdi)
> -L(fwd_write_53bytes):
> - lddqu -53(%rsi), %xmm0
> - movdqu %xmm0, -53(%rdi)
> -L(fwd_write_37bytes):
> - lddqu -37(%rsi), %xmm0
> - movdqu %xmm0, -37(%rdi)
> -L(fwd_write_21bytes):
> - lddqu -21(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -21(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_5bytes):
> - mov -5(%rsi), %edx
> - mov -4(%rsi), %ecx
> - mov %edx, -5(%rdi)
> - mov %ecx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_132bytes):
> - lddqu -132(%rsi), %xmm0
> - movdqu %xmm0, -132(%rdi)
> -L(fwd_write_116bytes):
> - lddqu -116(%rsi), %xmm0
> - movdqu %xmm0, -116(%rdi)
> -L(fwd_write_100bytes):
> - lddqu -100(%rsi), %xmm0
> - movdqu %xmm0, -100(%rdi)
> -L(fwd_write_84bytes):
> - lddqu -84(%rsi), %xmm0
> - movdqu %xmm0, -84(%rdi)
> -L(fwd_write_68bytes):
> - lddqu -68(%rsi), %xmm0
> - movdqu %xmm0, -68(%rdi)
> -L(fwd_write_52bytes):
> - lddqu -52(%rsi), %xmm0
> - movdqu %xmm0, -52(%rdi)
> -L(fwd_write_36bytes):
> - lddqu -36(%rsi), %xmm0
> - movdqu %xmm0, -36(%rdi)
> -L(fwd_write_20bytes):
> - lddqu -20(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -20(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_4bytes):
> - mov -4(%rsi), %edx
> - mov %edx, -4(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_131bytes):
> - lddqu -131(%rsi), %xmm0
> - movdqu %xmm0, -131(%rdi)
> -L(fwd_write_115bytes):
> - lddqu -115(%rsi), %xmm0
> - movdqu %xmm0, -115(%rdi)
> -L(fwd_write_99bytes):
> - lddqu -99(%rsi), %xmm0
> - movdqu %xmm0, -99(%rdi)
> -L(fwd_write_83bytes):
> - lddqu -83(%rsi), %xmm0
> - movdqu %xmm0, -83(%rdi)
> -L(fwd_write_67bytes):
> - lddqu -67(%rsi), %xmm0
> - movdqu %xmm0, -67(%rdi)
> -L(fwd_write_51bytes):
> - lddqu -51(%rsi), %xmm0
> - movdqu %xmm0, -51(%rdi)
> -L(fwd_write_35bytes):
> - lddqu -35(%rsi), %xmm0
> - movdqu %xmm0, -35(%rdi)
> -L(fwd_write_19bytes):
> - lddqu -19(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -19(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_3bytes):
> - mov -3(%rsi), %dx
> - mov -2(%rsi), %cx
> - mov %dx, -3(%rdi)
> - mov %cx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_130bytes):
> - lddqu -130(%rsi), %xmm0
> - movdqu %xmm0, -130(%rdi)
> -L(fwd_write_114bytes):
> - lddqu -114(%rsi), %xmm0
> - movdqu %xmm0, -114(%rdi)
> -L(fwd_write_98bytes):
> - lddqu -98(%rsi), %xmm0
> - movdqu %xmm0, -98(%rdi)
> -L(fwd_write_82bytes):
> - lddqu -82(%rsi), %xmm0
> - movdqu %xmm0, -82(%rdi)
> -L(fwd_write_66bytes):
> - lddqu -66(%rsi), %xmm0
> - movdqu %xmm0, -66(%rdi)
> -L(fwd_write_50bytes):
> - lddqu -50(%rsi), %xmm0
> - movdqu %xmm0, -50(%rdi)
> -L(fwd_write_34bytes):
> - lddqu -34(%rsi), %xmm0
> - movdqu %xmm0, -34(%rdi)
> -L(fwd_write_18bytes):
> - lddqu -18(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -18(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_2bytes):
> - movzwl -2(%rsi), %edx
> - mov %dx, -2(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_129bytes):
> - lddqu -129(%rsi), %xmm0
> - movdqu %xmm0, -129(%rdi)
> -L(fwd_write_113bytes):
> - lddqu -113(%rsi), %xmm0
> - movdqu %xmm0, -113(%rdi)
> -L(fwd_write_97bytes):
> - lddqu -97(%rsi), %xmm0
> - movdqu %xmm0, -97(%rdi)
> -L(fwd_write_81bytes):
> - lddqu -81(%rsi), %xmm0
> - movdqu %xmm0, -81(%rdi)
> -L(fwd_write_65bytes):
> - lddqu -65(%rsi), %xmm0
> - movdqu %xmm0, -65(%rdi)
> -L(fwd_write_49bytes):
> - lddqu -49(%rsi), %xmm0
> - movdqu %xmm0, -49(%rdi)
> -L(fwd_write_33bytes):
> - lddqu -33(%rsi), %xmm0
> - movdqu %xmm0, -33(%rdi)
> -L(fwd_write_17bytes):
> - lddqu -17(%rsi), %xmm0
> - lddqu -16(%rsi), %xmm1
> - movdqu %xmm0, -17(%rdi)
> - movdqu %xmm1, -16(%rdi)
> - ret
> -
> - .p2align 4
> -L(fwd_write_1bytes):
> - movzbl -1(%rsi), %edx
> - mov %dl, -1(%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_128bytes):
> - lddqu 112(%rsi), %xmm0
> - movdqu %xmm0, 112(%rdi)
> -L(bwd_write_112bytes):
> - lddqu 96(%rsi), %xmm0
> - movdqu %xmm0, 96(%rdi)
> -L(bwd_write_96bytes):
> - lddqu 80(%rsi), %xmm0
> - movdqu %xmm0, 80(%rdi)
> -L(bwd_write_80bytes):
> - lddqu 64(%rsi), %xmm0
> - movdqu %xmm0, 64(%rdi)
> -L(bwd_write_64bytes):
> - lddqu 48(%rsi), %xmm0
> - movdqu %xmm0, 48(%rdi)
> -L(bwd_write_48bytes):
> - lddqu 32(%rsi), %xmm0
> - movdqu %xmm0, 32(%rdi)
> -L(bwd_write_32bytes):
> - lddqu 16(%rsi), %xmm0
> - movdqu %xmm0, 16(%rdi)
> -L(bwd_write_16bytes):
> - lddqu (%rsi), %xmm0
> - movdqu %xmm0, (%rdi)
> -L(bwd_write_0bytes):
> - ret
> -
> - .p2align 4
> -L(bwd_write_143bytes):
> - lddqu 127(%rsi), %xmm0
> - movdqu %xmm0, 127(%rdi)
> -L(bwd_write_127bytes):
> - lddqu 111(%rsi), %xmm0
> - movdqu %xmm0, 111(%rdi)
> -L(bwd_write_111bytes):
> - lddqu 95(%rsi), %xmm0
> - movdqu %xmm0, 95(%rdi)
> -L(bwd_write_95bytes):
> - lddqu 79(%rsi), %xmm0
> - movdqu %xmm0, 79(%rdi)
> -L(bwd_write_79bytes):
> - lddqu 63(%rsi), %xmm0
> - movdqu %xmm0, 63(%rdi)
> -L(bwd_write_63bytes):
> - lddqu 47(%rsi), %xmm0
> - movdqu %xmm0, 47(%rdi)
> -L(bwd_write_47bytes):
> - lddqu 31(%rsi), %xmm0
> - movdqu %xmm0, 31(%rdi)
> -L(bwd_write_31bytes):
> - lddqu 15(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 15(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> -
> - .p2align 4
> -L(bwd_write_15bytes):
> - mov 7(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 7(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_142bytes):
> - lddqu 126(%rsi), %xmm0
> - movdqu %xmm0, 126(%rdi)
> -L(bwd_write_126bytes):
> - lddqu 110(%rsi), %xmm0
> - movdqu %xmm0, 110(%rdi)
> -L(bwd_write_110bytes):
> - lddqu 94(%rsi), %xmm0
> - movdqu %xmm0, 94(%rdi)
> -L(bwd_write_94bytes):
> - lddqu 78(%rsi), %xmm0
> - movdqu %xmm0, 78(%rdi)
> -L(bwd_write_78bytes):
> - lddqu 62(%rsi), %xmm0
> - movdqu %xmm0, 62(%rdi)
> -L(bwd_write_62bytes):
> - lddqu 46(%rsi), %xmm0
> - movdqu %xmm0, 46(%rdi)
> -L(bwd_write_46bytes):
> - lddqu 30(%rsi), %xmm0
> - movdqu %xmm0, 30(%rdi)
> -L(bwd_write_30bytes):
> - lddqu 14(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 14(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_14bytes):
> - mov 6(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 6(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_141bytes):
> - lddqu 125(%rsi), %xmm0
> - movdqu %xmm0, 125(%rdi)
> -L(bwd_write_125bytes):
> - lddqu 109(%rsi), %xmm0
> - movdqu %xmm0, 109(%rdi)
> -L(bwd_write_109bytes):
> - lddqu 93(%rsi), %xmm0
> - movdqu %xmm0, 93(%rdi)
> -L(bwd_write_93bytes):
> - lddqu 77(%rsi), %xmm0
> - movdqu %xmm0, 77(%rdi)
> -L(bwd_write_77bytes):
> - lddqu 61(%rsi), %xmm0
> - movdqu %xmm0, 61(%rdi)
> -L(bwd_write_61bytes):
> - lddqu 45(%rsi), %xmm0
> - movdqu %xmm0, 45(%rdi)
> -L(bwd_write_45bytes):
> - lddqu 29(%rsi), %xmm0
> - movdqu %xmm0, 29(%rdi)
> -L(bwd_write_29bytes):
> - lddqu 13(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 13(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_13bytes):
> - mov 5(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 5(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_140bytes):
> - lddqu 124(%rsi), %xmm0
> - movdqu %xmm0, 124(%rdi)
> -L(bwd_write_124bytes):
> - lddqu 108(%rsi), %xmm0
> - movdqu %xmm0, 108(%rdi)
> -L(bwd_write_108bytes):
> - lddqu 92(%rsi), %xmm0
> - movdqu %xmm0, 92(%rdi)
> -L(bwd_write_92bytes):
> - lddqu 76(%rsi), %xmm0
> - movdqu %xmm0, 76(%rdi)
> -L(bwd_write_76bytes):
> - lddqu 60(%rsi), %xmm0
> - movdqu %xmm0, 60(%rdi)
> -L(bwd_write_60bytes):
> - lddqu 44(%rsi), %xmm0
> - movdqu %xmm0, 44(%rdi)
> -L(bwd_write_44bytes):
> - lddqu 28(%rsi), %xmm0
> - movdqu %xmm0, 28(%rdi)
> -L(bwd_write_28bytes):
> - lddqu 12(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 12(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_12bytes):
> - mov 4(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 4(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_139bytes):
> - lddqu 123(%rsi), %xmm0
> - movdqu %xmm0, 123(%rdi)
> -L(bwd_write_123bytes):
> - lddqu 107(%rsi), %xmm0
> - movdqu %xmm0, 107(%rdi)
> -L(bwd_write_107bytes):
> - lddqu 91(%rsi), %xmm0
> - movdqu %xmm0, 91(%rdi)
> -L(bwd_write_91bytes):
> - lddqu 75(%rsi), %xmm0
> - movdqu %xmm0, 75(%rdi)
> -L(bwd_write_75bytes):
> - lddqu 59(%rsi), %xmm0
> - movdqu %xmm0, 59(%rdi)
> -L(bwd_write_59bytes):
> - lddqu 43(%rsi), %xmm0
> - movdqu %xmm0, 43(%rdi)
> -L(bwd_write_43bytes):
> - lddqu 27(%rsi), %xmm0
> - movdqu %xmm0, 27(%rdi)
> -L(bwd_write_27bytes):
> - lddqu 11(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 11(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_11bytes):
> - mov 3(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 3(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_138bytes):
> - lddqu 122(%rsi), %xmm0
> - movdqu %xmm0, 122(%rdi)
> -L(bwd_write_122bytes):
> - lddqu 106(%rsi), %xmm0
> - movdqu %xmm0, 106(%rdi)
> -L(bwd_write_106bytes):
> - lddqu 90(%rsi), %xmm0
> - movdqu %xmm0, 90(%rdi)
> -L(bwd_write_90bytes):
> - lddqu 74(%rsi), %xmm0
> - movdqu %xmm0, 74(%rdi)
> -L(bwd_write_74bytes):
> - lddqu 58(%rsi), %xmm0
> - movdqu %xmm0, 58(%rdi)
> -L(bwd_write_58bytes):
> - lddqu 42(%rsi), %xmm0
> - movdqu %xmm0, 42(%rdi)
> -L(bwd_write_42bytes):
> - lddqu 26(%rsi), %xmm0
> - movdqu %xmm0, 26(%rdi)
> -L(bwd_write_26bytes):
> - lddqu 10(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 10(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_10bytes):
> - mov 2(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 2(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_137bytes):
> - lddqu 121(%rsi), %xmm0
> - movdqu %xmm0, 121(%rdi)
> -L(bwd_write_121bytes):
> - lddqu 105(%rsi), %xmm0
> - movdqu %xmm0, 105(%rdi)
> -L(bwd_write_105bytes):
> - lddqu 89(%rsi), %xmm0
> - movdqu %xmm0, 89(%rdi)
> -L(bwd_write_89bytes):
> - lddqu 73(%rsi), %xmm0
> - movdqu %xmm0, 73(%rdi)
> -L(bwd_write_73bytes):
> - lddqu 57(%rsi), %xmm0
> - movdqu %xmm0, 57(%rdi)
> -L(bwd_write_57bytes):
> - lddqu 41(%rsi), %xmm0
> - movdqu %xmm0, 41(%rdi)
> -L(bwd_write_41bytes):
> - lddqu 25(%rsi), %xmm0
> - movdqu %xmm0, 25(%rdi)
> -L(bwd_write_25bytes):
> - lddqu 9(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 9(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_9bytes):
> - mov 1(%rsi), %rdx
> - mov (%rsi), %rcx
> - mov %rdx, 1(%rdi)
> - mov %rcx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_136bytes):
> - lddqu 120(%rsi), %xmm0
> - movdqu %xmm0, 120(%rdi)
> -L(bwd_write_120bytes):
> - lddqu 104(%rsi), %xmm0
> - movdqu %xmm0, 104(%rdi)
> -L(bwd_write_104bytes):
> - lddqu 88(%rsi), %xmm0
> - movdqu %xmm0, 88(%rdi)
> -L(bwd_write_88bytes):
> - lddqu 72(%rsi), %xmm0
> - movdqu %xmm0, 72(%rdi)
> -L(bwd_write_72bytes):
> - lddqu 56(%rsi), %xmm0
> - movdqu %xmm0, 56(%rdi)
> -L(bwd_write_56bytes):
> - lddqu 40(%rsi), %xmm0
> - movdqu %xmm0, 40(%rdi)
> -L(bwd_write_40bytes):
> - lddqu 24(%rsi), %xmm0
> - movdqu %xmm0, 24(%rdi)
> -L(bwd_write_24bytes):
> - lddqu 8(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 8(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_8bytes):
> - mov (%rsi), %rdx
> - mov %rdx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_135bytes):
> - lddqu 119(%rsi), %xmm0
> - movdqu %xmm0, 119(%rdi)
> -L(bwd_write_119bytes):
> - lddqu 103(%rsi), %xmm0
> - movdqu %xmm0, 103(%rdi)
> -L(bwd_write_103bytes):
> - lddqu 87(%rsi), %xmm0
> - movdqu %xmm0, 87(%rdi)
> -L(bwd_write_87bytes):
> - lddqu 71(%rsi), %xmm0
> - movdqu %xmm0, 71(%rdi)
> -L(bwd_write_71bytes):
> - lddqu 55(%rsi), %xmm0
> - movdqu %xmm0, 55(%rdi)
> -L(bwd_write_55bytes):
> - lddqu 39(%rsi), %xmm0
> - movdqu %xmm0, 39(%rdi)
> -L(bwd_write_39bytes):
> - lddqu 23(%rsi), %xmm0
> - movdqu %xmm0, 23(%rdi)
> -L(bwd_write_23bytes):
> - lddqu 7(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 7(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_7bytes):
> - mov 3(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 3(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_134bytes):
> - lddqu 118(%rsi), %xmm0
> - movdqu %xmm0, 118(%rdi)
> -L(bwd_write_118bytes):
> - lddqu 102(%rsi), %xmm0
> - movdqu %xmm0, 102(%rdi)
> -L(bwd_write_102bytes):
> - lddqu 86(%rsi), %xmm0
> - movdqu %xmm0, 86(%rdi)
> -L(bwd_write_86bytes):
> - lddqu 70(%rsi), %xmm0
> - movdqu %xmm0, 70(%rdi)
> -L(bwd_write_70bytes):
> - lddqu 54(%rsi), %xmm0
> - movdqu %xmm0, 54(%rdi)
> -L(bwd_write_54bytes):
> - lddqu 38(%rsi), %xmm0
> - movdqu %xmm0, 38(%rdi)
> -L(bwd_write_38bytes):
> - lddqu 22(%rsi), %xmm0
> - movdqu %xmm0, 22(%rdi)
> -L(bwd_write_22bytes):
> - lddqu 6(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 6(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_6bytes):
> - mov 2(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 2(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_133bytes):
> - lddqu 117(%rsi), %xmm0
> - movdqu %xmm0, 117(%rdi)
> -L(bwd_write_117bytes):
> - lddqu 101(%rsi), %xmm0
> - movdqu %xmm0, 101(%rdi)
> -L(bwd_write_101bytes):
> - lddqu 85(%rsi), %xmm0
> - movdqu %xmm0, 85(%rdi)
> -L(bwd_write_85bytes):
> - lddqu 69(%rsi), %xmm0
> - movdqu %xmm0, 69(%rdi)
> -L(bwd_write_69bytes):
> - lddqu 53(%rsi), %xmm0
> - movdqu %xmm0, 53(%rdi)
> -L(bwd_write_53bytes):
> - lddqu 37(%rsi), %xmm0
> - movdqu %xmm0, 37(%rdi)
> -L(bwd_write_37bytes):
> - lddqu 21(%rsi), %xmm0
> - movdqu %xmm0, 21(%rdi)
> -L(bwd_write_21bytes):
> - lddqu 5(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 5(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_5bytes):
> - mov 1(%rsi), %edx
> - mov (%rsi), %ecx
> - mov %edx, 1(%rdi)
> - mov %ecx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_132bytes):
> - lddqu 116(%rsi), %xmm0
> - movdqu %xmm0, 116(%rdi)
> -L(bwd_write_116bytes):
> - lddqu 100(%rsi), %xmm0
> - movdqu %xmm0, 100(%rdi)
> -L(bwd_write_100bytes):
> - lddqu 84(%rsi), %xmm0
> - movdqu %xmm0, 84(%rdi)
> -L(bwd_write_84bytes):
> - lddqu 68(%rsi), %xmm0
> - movdqu %xmm0, 68(%rdi)
> -L(bwd_write_68bytes):
> - lddqu 52(%rsi), %xmm0
> - movdqu %xmm0, 52(%rdi)
> -L(bwd_write_52bytes):
> - lddqu 36(%rsi), %xmm0
> - movdqu %xmm0, 36(%rdi)
> -L(bwd_write_36bytes):
> - lddqu 20(%rsi), %xmm0
> - movdqu %xmm0, 20(%rdi)
> -L(bwd_write_20bytes):
> - lddqu 4(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 4(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_4bytes):
> - mov (%rsi), %edx
> - mov %edx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_131bytes):
> - lddqu 115(%rsi), %xmm0
> - movdqu %xmm0, 115(%rdi)
> -L(bwd_write_115bytes):
> - lddqu 99(%rsi), %xmm0
> - movdqu %xmm0, 99(%rdi)
> -L(bwd_write_99bytes):
> - lddqu 83(%rsi), %xmm0
> - movdqu %xmm0, 83(%rdi)
> -L(bwd_write_83bytes):
> - lddqu 67(%rsi), %xmm0
> - movdqu %xmm0, 67(%rdi)
> -L(bwd_write_67bytes):
> - lddqu 51(%rsi), %xmm0
> - movdqu %xmm0, 51(%rdi)
> -L(bwd_write_51bytes):
> - lddqu 35(%rsi), %xmm0
> - movdqu %xmm0, 35(%rdi)
> -L(bwd_write_35bytes):
> - lddqu 19(%rsi), %xmm0
> - movdqu %xmm0, 19(%rdi)
> -L(bwd_write_19bytes):
> - lddqu 3(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 3(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_3bytes):
> - mov 1(%rsi), %dx
> - mov (%rsi), %cx
> - mov %dx, 1(%rdi)
> - mov %cx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_130bytes):
> - lddqu 114(%rsi), %xmm0
> - movdqu %xmm0, 114(%rdi)
> -L(bwd_write_114bytes):
> - lddqu 98(%rsi), %xmm0
> - movdqu %xmm0, 98(%rdi)
> -L(bwd_write_98bytes):
> - lddqu 82(%rsi), %xmm0
> - movdqu %xmm0, 82(%rdi)
> -L(bwd_write_82bytes):
> - lddqu 66(%rsi), %xmm0
> - movdqu %xmm0, 66(%rdi)
> -L(bwd_write_66bytes):
> - lddqu 50(%rsi), %xmm0
> - movdqu %xmm0, 50(%rdi)
> -L(bwd_write_50bytes):
> - lddqu 34(%rsi), %xmm0
> - movdqu %xmm0, 34(%rdi)
> -L(bwd_write_34bytes):
> - lddqu 18(%rsi), %xmm0
> - movdqu %xmm0, 18(%rdi)
> -L(bwd_write_18bytes):
> - lddqu 2(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 2(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_2bytes):
> - movzwl (%rsi), %edx
> - mov %dx, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_129bytes):
> - lddqu 113(%rsi), %xmm0
> - movdqu %xmm0, 113(%rdi)
> -L(bwd_write_113bytes):
> - lddqu 97(%rsi), %xmm0
> - movdqu %xmm0, 97(%rdi)
> -L(bwd_write_97bytes):
> - lddqu 81(%rsi), %xmm0
> - movdqu %xmm0, 81(%rdi)
> -L(bwd_write_81bytes):
> - lddqu 65(%rsi), %xmm0
> - movdqu %xmm0, 65(%rdi)
> -L(bwd_write_65bytes):
> - lddqu 49(%rsi), %xmm0
> - movdqu %xmm0, 49(%rdi)
> -L(bwd_write_49bytes):
> - lddqu 33(%rsi), %xmm0
> - movdqu %xmm0, 33(%rdi)
> -L(bwd_write_33bytes):
> - lddqu 17(%rsi), %xmm0
> - movdqu %xmm0, 17(%rdi)
> -L(bwd_write_17bytes):
> - lddqu 1(%rsi), %xmm0
> - lddqu (%rsi), %xmm1
> - movdqu %xmm0, 1(%rdi)
> - movdqu %xmm1, (%rdi)
> - ret
> -
> - .p2align 4
> -L(bwd_write_1bytes):
> - movzbl (%rsi), %edx
> - mov %dl, (%rdi)
> - ret
> -
> -END (MEMCPY)
> -
> - .section .rodata.ssse3,"a",@progbits
> - .p2align 3
> -L(table_144_bytes_bwd):
> - .int JMPTBL (L(bwd_write_0bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_1bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_2bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_3bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_4bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_5bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_6bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_7bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_8bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_9bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_10bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_11bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_12bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_13bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_14bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_15bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_16bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_17bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_18bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_19bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_20bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_21bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_22bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_23bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_24bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_25bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_26bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_27bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_28bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_29bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_30bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_31bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_32bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_33bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_34bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_35bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_36bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_37bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_38bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_39bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_40bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_41bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_42bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_43bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_44bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_45bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_46bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_47bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_48bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_49bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_50bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_51bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_52bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_53bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_54bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_55bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_56bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_57bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_58bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_59bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_60bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_61bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_62bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_63bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_64bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_65bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_66bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_67bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_68bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_69bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_70bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_71bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_72bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_73bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_74bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_75bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_76bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_77bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_78bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_79bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_80bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_81bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_82bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_83bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_84bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_85bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_86bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_87bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_88bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_89bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_90bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_91bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_92bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_93bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_94bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_95bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_96bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_97bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_98bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_99bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_100bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_101bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_102bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_103bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_104bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_105bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_106bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_107bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_108bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_109bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_110bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_111bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_112bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_113bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_114bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_115bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_116bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_117bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_118bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_119bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_120bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_121bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_122bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_123bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_124bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_125bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_126bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_127bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_128bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_129bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_130bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_131bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_132bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_133bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_134bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_135bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_136bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_137bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_138bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_139bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_140bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_141bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_142bytes), L(table_144_bytes_bwd))
> - .int JMPTBL (L(bwd_write_143bytes), L(table_144_bytes_bwd))
> -
> - .p2align 3
> -L(table_144_bytes_fwd):
> - .int JMPTBL (L(fwd_write_0bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_1bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_2bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_3bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_4bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_5bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_6bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_7bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_8bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_9bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_10bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_11bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_12bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_13bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_14bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_15bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_16bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_17bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_18bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_19bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_20bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_21bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_22bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_23bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_24bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_25bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_26bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_27bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_28bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_29bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_30bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_31bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_32bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_33bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_34bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_35bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_36bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_37bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_38bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_39bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_40bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_41bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_42bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_43bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_44bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_45bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_46bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_47bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_48bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_49bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_50bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_51bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_52bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_53bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_54bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_55bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_56bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_57bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_58bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_59bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_60bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_61bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_62bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_63bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_64bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_65bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_66bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_67bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_68bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_69bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_70bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_71bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_72bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_73bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_74bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_75bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_76bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_77bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_78bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_79bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_80bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_81bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_82bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_83bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_84bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_85bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_86bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_87bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_88bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_89bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_90bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_91bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_92bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_93bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_94bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_95bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_96bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_97bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_98bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_99bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_100bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_101bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_102bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_103bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_104bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_105bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_106bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_107bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_108bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_109bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_110bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_111bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_112bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_113bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_114bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_115bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_116bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_117bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_118bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_119bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_120bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_121bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_122bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_123bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_124bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_125bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_126bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_127bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_128bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_129bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_130bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_131bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_132bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_133bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_134bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_135bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_136bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_137bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_138bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_139bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_140bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_141bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_142bytes), L(table_144_bytes_fwd))
> - .int JMPTBL (L(fwd_write_143bytes), L(table_144_bytes_fwd))
> -
> - .p2align 3
> -L(shl_table_fwd):
> - .int JMPTBL (L(shl_0), L(shl_table_fwd))
> - .int JMPTBL (L(shl_1), L(shl_table_fwd))
> - .int JMPTBL (L(shl_2), L(shl_table_fwd))
> - .int JMPTBL (L(shl_3), L(shl_table_fwd))
> - .int JMPTBL (L(shl_4), L(shl_table_fwd))
> - .int JMPTBL (L(shl_5), L(shl_table_fwd))
> - .int JMPTBL (L(shl_6), L(shl_table_fwd))
> - .int JMPTBL (L(shl_7), L(shl_table_fwd))
> - .int JMPTBL (L(shl_8), L(shl_table_fwd))
> - .int JMPTBL (L(shl_9), L(shl_table_fwd))
> - .int JMPTBL (L(shl_10), L(shl_table_fwd))
> - .int JMPTBL (L(shl_11), L(shl_table_fwd))
> - .int JMPTBL (L(shl_12), L(shl_table_fwd))
> - .int JMPTBL (L(shl_13), L(shl_table_fwd))
> - .int JMPTBL (L(shl_14), L(shl_table_fwd))
> - .int JMPTBL (L(shl_15), L(shl_table_fwd))
> -
> - .p2align 3
> -L(shl_table_bwd):
> - .int JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
> - .int JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
> -
> -#endif
> diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S b/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> deleted file mode 100644
> index f9a4e9aff9..0000000000
> --- a/sysdeps/x86_64/multiarch/memmove-ssse3-back.S
> +++ /dev/null
> @@ -1,4 +0,0 @@
> -#define USE_AS_MEMMOVE
> -#define MEMCPY __memmove_ssse3_back
> -#define MEMCPY_CHK __memmove_chk_ssse3_back
> -#include "memcpy-ssse3-back.S"
> --
> 2.25.1
>
LGTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Thanks.
--
H.J.
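
The file deleted above leans on two idioms that are easier to see in a
higher-level form. Below is a minimal C sketch of both, assuming only C99
and memcpy; the names copy_17_to_32, tail_fn, tail_table, and copy_small
are invented for the sketch and are not glibc identifiers.

#include <stddef.h>
#include <string.h>

/* Idiom 1: overlapping-chunk copy.  Labels such as L(fwd_write_23bytes)
   load -23(%rsi) and -16(%rsi) and store both chunks, so any length in
   17..32 is covered by two 16-byte loads and two 16-byte stores with no
   loop; the chunks simply overlap by 32 - n bytes.  Doing both loads
   before either store keeps the copy correct even when the buffers
   overlap in the forward direction.  */
static void
copy_17_to_32 (void *dst, const void *src, size_t n)
{
  unsigned char head[16], tail[16];
  memcpy (head, src, 16);                                   /* bytes 0..15      */
  memcpy (tail, (const unsigned char *) src + n - 16, 16);  /* bytes n-16..n-1  */
  memcpy (dst, head, 16);
  memcpy ((unsigned char *) dst + n - 16, tail, 16);
}

/* Idiom 2: size-dispatched tails.  L(table_144_bytes_fwd) holds one
   entry per residual length 0..143; JMPTBL emits each entry as a 32-bit
   offset relative to the table itself, keeping the table
   position-independent, and the code adds the offset to the table base
   and jumps.  C has no portable relative jump table, so this sketch
   uses an (absolute) function-pointer table instead.  */
typedef void (*tail_fn) (void *dst, const void *src);

static void tail_0 (void *d, const void *s) { (void) d; (void) s; }
static void tail_1 (void *d, const void *s) { memcpy (d, s, 1); }
static void tail_2 (void *d, const void *s) { memcpy (d, s, 2); }
/* ...one handler per residual length, up to tail_143, in the real
   scheme; only three are spelled out here, so copy_small below is
   valid for n <= 2 only.  */

static const tail_fn tail_table[] = { tail_0, tail_1, tail_2 };

static void
copy_small (void *dst, const void *src, size_t n)
{
  tail_table[n] (dst, src);  /* one indirect jump replaces a compare chain */
}

The win of the table is a single indirect branch instead of a cascade of
size compares, at the cost of the read-only tables and per-length code
blocks that the removal above reclaims.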
Thread overview: 49+ messages (thread ended 2022-04-14 18:13 UTC)
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55 ` H.J. Lu
2022-03-25 20:44 ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-04-10 0:57 ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 20:44 ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 1/6] " Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:42 ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-10 0:48 ` Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10 0:54 ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47 ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-14 18:05 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-14 18:06 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-14 18:10 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 18:13 ` H.J. Lu
2022-04-14 16:47 ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-14 18:04 ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 19:56 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-03-25 19:57 ` H.J. Lu
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
2022-03-25 20:40 ` Noah Goldstein