public inbox for libc-alpha@sourceware.org
* [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3
@ 2022-03-25 18:36 Noah Goldstein
  2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
                   ` (6 more replies)
  0 siblings, 7 replies; 56+ messages in thread
From: Noah Goldstein @ 2022-03-25 18:36 UTC (permalink / raw)
  To: libc-alpha

With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
SSSE3. As a result, it is no longer worth the code size cost of
keeping the SSSE3 versions.
---
 sysdeps/x86_64/multiarch/Makefile          |    2 -
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |    4 -
 sysdeps/x86_64/multiarch/ifunc-memcmp.h    |    4 -
 sysdeps/x86_64/multiarch/memcmp-ssse3.S    | 1992 --------------------
 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S   |    4 -
 5 files changed, 2006 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/memcmp-ssse3.S
 delete mode 100644 sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
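
For reference, after this change the memcmp ifunc selection order
reduces to roughly the following (a simplified, hypothetical sketch of
the logic in sysdeps/x86_64/multiarch/ifunc-memcmp.h, reusing the
glibc-internal init-arch.h macros that appear in the diff below; the
real selector also gates the AVX2/EVEX paths on MOVBE, RTM, and
AVX_Fast_Unaligned_Load):

static inline void *
simplified_memcmp_selector (const struct cpu_features *cpu_features)
{
  /* Prefer the widest usable vector extension.  The SSSE3 branch
     that used to sit between SSE4.1 and SSE2 is gone.  */
  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
    return OPTIMIZE (evex_movbe);

  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
    return OPTIMIZE (avx2_movbe);

  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
    return OPTIMIZE (sse4_1);

  /* SSE2 is part of the x86_64 baseline, so this never fails.  */
  return OPTIMIZE (sse2);
}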

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 6507d1b7fa..51222dfab1 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -12,7 +12,6 @@ sysdep_routines += \
   memcmp-evex-movbe \
   memcmp-sse2 \
   memcmp-sse4 \
-  memcmp-ssse3 \
   memcmpeq-avx2 \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
@@ -179,7 +178,6 @@ sysdep_routines += \
   wmemcmp-c \
   wmemcmp-evex-movbe \
   wmemcmp-sse4 \
-  wmemcmp-ssse3 \
 # sysdep_routines
 endif
 
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 40cc6cc49e..f389928a4e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -98,8 +98,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __memcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
-			      __memcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
 
 #ifdef SHARED
@@ -844,8 +842,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __wmemcmp_sse4_1)
-	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
-			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 
   /* Support sysdeps/x86_64/multiarch/wmemset.c.  */
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index cd12613699..44759a3ad5 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -20,7 +20,6 @@
 # include <init-arch.h>
 
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
@@ -50,8 +49,5 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
 
-  if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
-    return OPTIMIZE (ssse3);
-
   return OPTIMIZE (sse2);
 }
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
deleted file mode 100644
index df1b1fc494..0000000000
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ /dev/null
@@ -1,1992 +0,0 @@
-/* memcmp with SSSE3, wmemcmp with SSSE3
-   Copyright (C) 2011-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-#  define MEMCMP	__memcmp_ssse3
-# endif
-
-/* Warning!
-	   wmemcmp has to use SIGNED comparison for elements.
-	   memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
-	atom_text_section
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
-	shl	$2, %RDX_LP
-	test	%RDX_LP, %RDX_LP
-	jz	L(equal)
-# elif defined __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-# endif
-	mov	%rdx, %rcx
-	mov	%rdi, %rdx
-	cmp	$48, %rcx;
-	jae	L(48bytesormore)	/* LEN => 48  */
-
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-/* ECX >= 32.  */
-L(48bytesormore):
-	movdqu	(%rdi), %xmm3
-	movdqu	(%rsi), %xmm0
-	pcmpeqb	%xmm0, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	16(%rdi), %rdi
-	lea	16(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(less16bytes)
-	mov	%edi, %edx
-	and	$0xf, %edx
-	xor	%rdx, %rdi
-	sub	%rdx, %rsi
-	add	%rdx, %rcx
-	mov	%esi, %edx
-	and	$0xf, %edx
-	jz	L(shr_0)
-	xor	%rdx, %rsi
-
-# ifndef USE_AS_WMEMCMP
-	cmp	$8, %edx
-	jae	L(next_unaligned_table)
-	cmp	$0, %edx
-	je	L(shr_0)
-	cmp	$1, %edx
-	je	L(shr_1)
-	cmp	$2, %edx
-	je	L(shr_2)
-	cmp	$3, %edx
-	je	L(shr_3)
-	cmp	$4, %edx
-	je	L(shr_4)
-	cmp	$5, %edx
-	je	L(shr_5)
-	cmp	$6, %edx
-	je	L(shr_6)
-	jmp	L(shr_7)
-
-	.p2align 2
-L(next_unaligned_table):
-	cmp	$8, %edx
-	je	L(shr_8)
-	cmp	$9, %edx
-	je	L(shr_9)
-	cmp	$10, %edx
-	je	L(shr_10)
-	cmp	$11, %edx
-	je	L(shr_11)
-	cmp	$12, %edx
-	je	L(shr_12)
-	cmp	$13, %edx
-	je	L(shr_13)
-	cmp	$14, %edx
-	je	L(shr_14)
-	jmp	L(shr_15)
-# else
-	cmp	$0, %edx
-	je	L(shr_0)
-	cmp	$4, %edx
-	je	L(shr_4)
-	cmp	$8, %edx
-	je	L(shr_8)
-	jmp	L(shr_12)
-# endif
-
-	.p2align 4
-L(shr_0):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	jae	L(shr_0_gobble)
-	xor	%eax, %eax
-	movdqa	(%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-	movdqa	16(%rsi), %xmm2
-	pcmpeqb	16(%rdi), %xmm2
-	pand	%xmm1, %xmm2
-	pmovmskb %xmm2, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_0_gobble):
-	movdqa	(%rsi), %xmm0
-	xor	%eax, %eax
-	pcmpeqb	(%rdi), %xmm0
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm2
-	pcmpeqb	16(%rdi), %xmm2
-L(shr_0_gobble_loop):
-	pand	%xmm0, %xmm2
-	sub	$32, %rcx
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	movdqa	32(%rsi), %xmm0
-	movdqa	48(%rsi), %xmm2
-	sbb	$0xffff, %edx
-	pcmpeqb	32(%rdi), %xmm0
-	pcmpeqb	48(%rdi), %xmm2
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	jz	L(shr_0_gobble_loop)
-
-	pand	%xmm0, %xmm2
-	cmp	$0, %rcx
-	jge	L(next)
-	inc	%edx
-	add	$32, %rcx
-L(next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm2, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_1):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_1_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$1, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$1, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$1, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_1_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$1, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$1, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_1_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$1, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$1, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_1_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_1_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_1_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	1(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-
-	.p2align 4
-L(shr_2):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_2_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$2, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$2, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$2, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_2_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$2, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$2, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_2_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$2, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$2, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_2_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_2_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_2_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	2(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_3):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_3_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$3, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$3, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$3, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_3_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$3, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$3, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_3_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$3, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$3, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_3_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_3_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_3_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	3(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_4):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_4_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$4, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$4, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$4, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_4_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$4, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$4, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_4_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$4, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$4, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_4_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_4_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_4_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	4(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_5):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_5_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$5, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$5, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$5, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_5_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$5, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$5, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_5_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$5, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$5, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_5_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_5_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_5_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	5(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_6):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_6_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$6, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$6, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$6, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_6_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$6, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$6, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_6_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$6, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$6, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_6_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_6_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_6_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	6(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_7):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_7_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$7, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$7, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$7, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_7_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$7, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$7, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_7_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$7, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$7, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_7_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_7_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_7_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	7(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_8):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_8_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$8, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$8, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$8, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_8_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$8, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$8, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_8_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$8, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$8, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_8_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_8_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_8_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	8(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_9):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_9_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$9, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$9, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$9, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_9_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$9, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$9, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_9_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$9, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$9, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_9_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_9_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_9_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	9(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_10):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_10_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$10, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$10, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$10, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_10_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$10, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$10, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_10_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$10, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$10, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_10_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_10_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_10_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	10(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_11):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_11_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$11, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$11, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$11, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_11_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$11, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$11, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_11_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$11, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$11, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_11_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_11_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_11_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	11(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# endif
-
-	.p2align 4
-L(shr_12):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_12_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$12, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$12, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$12, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_12_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$12, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$12, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_12_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$12, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$12, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_12_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_12_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_12_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	12(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-# ifndef USE_AS_WMEMCMP
-
-	.p2align 4
-L(shr_13):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_13_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$13, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$13, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$13, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_13_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$13, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$13, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_13_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$13, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$13, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_13_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_13_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_13_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	13(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_14):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_14_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$14, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$14, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$14, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_14_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$14, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$14, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_14_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$14, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$14, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_14_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_14_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_14_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	14(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_15):
-	cmp	$80, %rcx
-	lea	-48(%rcx), %rcx
-	mov	%edx, %eax
-	jae	L(shr_15_gobble)
-
-	movdqa	16(%rsi), %xmm1
-	movdqa	%xmm1, %xmm2
-	palignr	$15, (%rsi), %xmm1
-	pcmpeqb	(%rdi), %xmm1
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$15, %xmm2, %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-	pand	%xmm1, %xmm3
-	pmovmskb %xmm3, %edx
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-	add	$15, %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-
-	.p2align 4
-L(shr_15_gobble):
-	sub	$32, %rcx
-	movdqa	16(%rsi), %xmm0
-	palignr	$15, (%rsi), %xmm0
-	pcmpeqb	(%rdi), %xmm0
-
-	movdqa	32(%rsi), %xmm3
-	palignr	$15, 16(%rsi), %xmm3
-	pcmpeqb	16(%rdi), %xmm3
-
-L(shr_15_gobble_loop):
-	pand	%xmm0, %xmm3
-	sub	$32, %rcx
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-
-	movdqa	64(%rsi), %xmm3
-	palignr	$15, 48(%rsi), %xmm3
-	sbb	$0xffff, %edx
-	movdqa	48(%rsi), %xmm0
-	palignr	$15, 32(%rsi), %xmm0
-	pcmpeqb	32(%rdi), %xmm0
-	lea	32(%rsi), %rsi
-	pcmpeqb	48(%rdi), %xmm3
-
-	lea	32(%rdi), %rdi
-	jz	L(shr_15_gobble_loop)
-	pand	%xmm0, %xmm3
-
-	cmp	$0, %rcx
-	jge	L(shr_15_gobble_next)
-	inc	%edx
-	add	$32, %rcx
-L(shr_15_gobble_next):
-	test	%edx, %edx
-	jnz	L(exit)
-
-	pmovmskb %xmm3, %edx
-	movdqa	%xmm0, %xmm1
-	lea	32(%rdi), %rdi
-	lea	32(%rsi), %rsi
-	sub	$0xffff, %edx
-	jnz	L(exit)
-
-	lea	15(%rsi), %rsi
-	add	%rcx, %rsi
-	add	%rcx, %rdi
-	jmp	L(less48bytes)
-# endif
-	.p2align 4
-L(exit):
-	pmovmskb %xmm1, %r8d
-	sub	$0xffff, %r8d
-	jz	L(first16bytes)
-	lea	-16(%rsi), %rsi
-	lea	-16(%rdi), %rdi
-	mov	%r8d, %edx
-L(first16bytes):
-	add	%rax, %rsi
-L(less16bytes):
-# ifndef USE_AS_WMEMCMP
-	test	%dl, %dl
-	jz	L(next_24_bytes)
-
-	test	$0x01, %dl
-	jnz	L(Byte16)
-
-	test	$0x02, %dl
-	jnz	L(Byte17)
-
-	test	$0x04, %dl
-	jnz	L(Byte18)
-
-	test	$0x08, %dl
-	jnz	L(Byte19)
-
-	test	$0x10, %dl
-	jnz	L(Byte20)
-
-	test	$0x20, %dl
-	jnz	L(Byte21)
-
-	test	$0x40, %dl
-	jnz	L(Byte22)
-
-	movzbl	-9(%rdi), %eax
-	movzbl	-9(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte16):
-	movzbl	-16(%rdi), %eax
-	movzbl	-16(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte17):
-	movzbl	-15(%rdi), %eax
-	movzbl	-15(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte18):
-	movzbl	-14(%rdi), %eax
-	movzbl	-14(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte19):
-	movzbl	-13(%rdi), %eax
-	movzbl	-13(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte20):
-	movzbl	-12(%rdi), %eax
-	movzbl	-12(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte21):
-	movzbl	-11(%rdi), %eax
-	movzbl	-11(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(Byte22):
-	movzbl	-10(%rdi), %eax
-	movzbl	-10(%rsi), %edx
-	sub	%edx, %eax
-	ret
-
-	.p2align 4
-L(next_24_bytes):
-	lea	8(%rdi), %rdi
-	lea	8(%rsi), %rsi
-	test	$0x01, %dh
-	jnz	L(Byte16)
-
-	test	$0x02, %dh
-	jnz	L(Byte17)
-
-	test	$0x04, %dh
-	jnz	L(Byte18)
-
-	test	$0x08, %dh
-	jnz	L(Byte19)
-
-	test	$0x10, %dh
-	jnz	L(Byte20)
-
-	test	$0x20, %dh
-	jnz	L(Byte21)
-
-	test	$0x40, %dh
-	jnz	L(Byte22)
-
-	movzbl	-9(%rdi), %eax
-	movzbl	-9(%rsi), %edx
-	sub	%edx, %eax
-	ret
-# else
-/* special for wmemcmp */
-	xor	%eax, %eax
-	test	%dl, %dl
-	jz	L(next_two_double_words)
-	and	$15, %dl
-	jz	L(second_double_word)
-	mov	-16(%rdi), %eax
-	cmp	-16(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(second_double_word):
-	mov	-12(%rdi), %eax
-	cmp	-12(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(next_two_double_words):
-	and	$15, %dh
-	jz	L(fourth_double_word)
-	mov	-8(%rdi), %eax
-	cmp	-8(%rsi), %eax
-	jne	L(find_diff)
-	ret
-
-	.p2align 4
-L(fourth_double_word):
-	mov	-4(%rdi), %eax
-	cmp	-4(%rsi), %eax
-	jne	L(find_diff)
-	ret
-# endif
-
-	.p2align 4
-L(less48bytes):
-	cmp	$8, %ecx
-	jae	L(more8bytes)
-	cmp	$0, %ecx
-	je	L(0bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$1, %ecx
-	je	L(1bytes)
-	cmp	$2, %ecx
-	je	L(2bytes)
-	cmp	$3, %ecx
-	je	L(3bytes)
-	cmp	$4, %ecx
-	je	L(4bytes)
-	cmp	$5, %ecx
-	je	L(5bytes)
-	cmp	$6, %ecx
-	je	L(6bytes)
-	jmp	L(7bytes)
-# else
-	jmp	L(4bytes)
-# endif
-
-	.p2align 4
-L(more8bytes):
-	cmp	$16, %ecx
-	jae	L(more16bytes)
-	cmp	$8, %ecx
-	je	L(8bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$9, %ecx
-	je	L(9bytes)
-	cmp	$10, %ecx
-	je	L(10bytes)
-	cmp	$11, %ecx
-	je	L(11bytes)
-	cmp	$12, %ecx
-	je	L(12bytes)
-	cmp	$13, %ecx
-	je	L(13bytes)
-	cmp	$14, %ecx
-	je	L(14bytes)
-	jmp	L(15bytes)
-# else
-	jmp	L(12bytes)
-# endif
-
-	.p2align 4
-L(more16bytes):
-	cmp	$24, %ecx
-	jae	L(more24bytes)
-	cmp	$16, %ecx
-	je	L(16bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$17, %ecx
-	je	L(17bytes)
-	cmp	$18, %ecx
-	je	L(18bytes)
-	cmp	$19, %ecx
-	je	L(19bytes)
-	cmp	$20, %ecx
-	je	L(20bytes)
-	cmp	$21, %ecx
-	je	L(21bytes)
-	cmp	$22, %ecx
-	je	L(22bytes)
-	jmp	L(23bytes)
-# else
-	jmp	L(20bytes)
-# endif
-
-	.p2align 4
-L(more24bytes):
-	cmp	$32, %ecx
-	jae	L(more32bytes)
-	cmp	$24, %ecx
-	je	L(24bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$25, %ecx
-	je	L(25bytes)
-	cmp	$26, %ecx
-	je	L(26bytes)
-	cmp	$27, %ecx
-	je	L(27bytes)
-	cmp	$28, %ecx
-	je	L(28bytes)
-	cmp	$29, %ecx
-	je	L(29bytes)
-	cmp	$30, %ecx
-	je	L(30bytes)
-	jmp	L(31bytes)
-# else
-	jmp	L(28bytes)
-# endif
-
-	.p2align 4
-L(more32bytes):
-	cmp	$40, %ecx
-	jae	L(more40bytes)
-	cmp	$32, %ecx
-	je	L(32bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$33, %ecx
-	je	L(33bytes)
-	cmp	$34, %ecx
-	je	L(34bytes)
-	cmp	$35, %ecx
-	je	L(35bytes)
-	cmp	$36, %ecx
-	je	L(36bytes)
-	cmp	$37, %ecx
-	je	L(37bytes)
-	cmp	$38, %ecx
-	je	L(38bytes)
-	jmp	L(39bytes)
-# else
-	jmp	L(36bytes)
-# endif
-
-	.p2align 4
-L(more40bytes):
-	cmp	$40, %ecx
-	je	L(40bytes)
-# ifndef USE_AS_WMEMCMP
-	cmp	$41, %ecx
-	je	L(41bytes)
-	cmp	$42, %ecx
-	je	L(42bytes)
-	cmp	$43, %ecx
-	je	L(43bytes)
-	cmp	$44, %ecx
-	je	L(44bytes)
-	cmp	$45, %ecx
-	je	L(45bytes)
-	cmp	$46, %ecx
-	je	L(46bytes)
-	jmp	L(47bytes)
-
-	.p2align 4
-L(44bytes):
-	movl	-44(%rdi), %eax
-	movl	-44(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(40bytes):
-	movl	-40(%rdi), %eax
-	movl	-40(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(36bytes):
-	movl	-36(%rdi), %eax
-	movl	-36(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(32bytes):
-	movl	-32(%rdi), %eax
-	movl	-32(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(28bytes):
-	movl	-28(%rdi), %eax
-	movl	-28(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(24bytes):
-	movl	-24(%rdi), %eax
-	movl	-24(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(20bytes):
-	movl	-20(%rdi), %eax
-	movl	-20(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(16bytes):
-	movl	-16(%rdi), %eax
-	movl	-16(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(12bytes):
-	movl	-12(%rdi), %eax
-	movl	-12(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(8bytes):
-	movl	-8(%rdi), %eax
-	movl	-8(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(4bytes):
-	movl	-4(%rdi), %eax
-	movl	-4(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-# else
-	.p2align 4
-L(44bytes):
-	movl	-44(%rdi), %eax
-	cmp	-44(%rsi), %eax
-	jne	L(find_diff)
-L(40bytes):
-	movl	-40(%rdi), %eax
-	cmp	-40(%rsi), %eax
-	jne	L(find_diff)
-L(36bytes):
-	movl	-36(%rdi), %eax
-	cmp	-36(%rsi), %eax
-	jne	L(find_diff)
-L(32bytes):
-	movl	-32(%rdi), %eax
-	cmp	-32(%rsi), %eax
-	jne	L(find_diff)
-L(28bytes):
-	movl	-28(%rdi), %eax
-	cmp	-28(%rsi), %eax
-	jne	L(find_diff)
-L(24bytes):
-	movl	-24(%rdi), %eax
-	cmp	-24(%rsi), %eax
-	jne	L(find_diff)
-L(20bytes):
-	movl	-20(%rdi), %eax
-	cmp	-20(%rsi), %eax
-	jne	L(find_diff)
-L(16bytes):
-	movl	-16(%rdi), %eax
-	cmp	-16(%rsi), %eax
-	jne	L(find_diff)
-L(12bytes):
-	movl	-12(%rdi), %eax
-	cmp	-12(%rsi), %eax
-	jne	L(find_diff)
-L(8bytes):
-	movl	-8(%rdi), %eax
-	cmp	-8(%rsi), %eax
-	jne	L(find_diff)
-L(4bytes):
-	movl	-4(%rdi), %eax
-	cmp	-4(%rsi), %eax
-	jne	L(find_diff)
-L(0bytes):
-	xor	%eax, %eax
-	ret
-# endif
-
-# ifndef USE_AS_WMEMCMP
-	.p2align 4
-L(45bytes):
-	movl	-45(%rdi), %eax
-	movl	-45(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(41bytes):
-	movl	-41(%rdi), %eax
-	movl	-41(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(37bytes):
-	movl	-37(%rdi), %eax
-	movl	-37(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(33bytes):
-	movl	-33(%rdi), %eax
-	movl	-33(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(29bytes):
-	movl	-29(%rdi), %eax
-	movl	-29(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(25bytes):
-	movl	-25(%rdi), %eax
-	movl	-25(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(21bytes):
-	movl	-21(%rdi), %eax
-	movl	-21(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(17bytes):
-	movl	-17(%rdi), %eax
-	movl	-17(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(13bytes):
-	movl	-13(%rdi), %eax
-	movl	-13(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(9bytes):
-	movl	-9(%rdi), %eax
-	movl	-9(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(5bytes):
-	movl	-5(%rdi), %eax
-	movl	-5(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(1bytes):
-	movzbl	-1(%rdi), %eax
-	cmpb	-1(%rsi), %al
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(46bytes):
-	movl	-46(%rdi), %eax
-	movl	-46(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(42bytes):
-	movl	-42(%rdi), %eax
-	movl	-42(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(38bytes):
-	movl	-38(%rdi), %eax
-	movl	-38(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(34bytes):
-	movl	-34(%rdi), %eax
-	movl	-34(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(30bytes):
-	movl	-30(%rdi), %eax
-	movl	-30(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(26bytes):
-	movl	-26(%rdi), %eax
-	movl	-26(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(22bytes):
-	movl	-22(%rdi), %eax
-	movl	-22(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(18bytes):
-	movl	-18(%rdi), %eax
-	movl	-18(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(14bytes):
-	movl	-14(%rdi), %eax
-	movl	-14(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(10bytes):
-	movl	-10(%rdi), %eax
-	movl	-10(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(6bytes):
-	movl	-6(%rdi), %eax
-	movl	-6(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(2bytes):
-	movzwl	-2(%rdi), %eax
-	movzwl	-2(%rsi), %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-	cmp	%ecx, %eax
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(47bytes):
-	movl	-47(%rdi), %eax
-	movl	-47(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(43bytes):
-	movl	-43(%rdi), %eax
-	movl	-43(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(39bytes):
-	movl	-39(%rdi), %eax
-	movl	-39(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(35bytes):
-	movl	-35(%rdi), %eax
-	movl	-35(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(31bytes):
-	movl	-31(%rdi), %eax
-	movl	-31(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(27bytes):
-	movl	-27(%rdi), %eax
-	movl	-27(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(23bytes):
-	movl	-23(%rdi), %eax
-	movl	-23(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(19bytes):
-	movl	-19(%rdi), %eax
-	movl	-19(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(15bytes):
-	movl	-15(%rdi), %eax
-	movl	-15(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(11bytes):
-	movl	-11(%rdi), %eax
-	movl	-11(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(7bytes):
-	movl	-7(%rdi), %eax
-	movl	-7(%rsi), %ecx
-	cmp	%ecx, %eax
-	jne	L(find_diff)
-L(3bytes):
-	movzwl	-3(%rdi), %eax
-	movzwl	-3(%rsi), %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-	cmp	%ecx, %eax
-	jne	L(set)
-	movzbl	-1(%rdi), %eax
-	cmpb	-1(%rsi), %al
-	jne	L(set)
-	xor	%eax, %eax
-	ret
-
-	.p2align 4
-L(find_diff):
-	cmpb	%cl, %al
-	jne	L(set)
-	cmpw	%cx, %ax
-	jne	L(set)
-	shr	$16, %eax
-	shr	$16, %ecx
-	cmpb	%cl, %al
-	jne	L(set)
-
-/* We get there only if we already know there is a
-difference.  */
-
-	cmp	%ecx, %eax
-L(set):
-	sbb	%eax, %eax
-	sbb	$-1, %eax
-	ret
-# else
-
-/* for wmemcmp */
-	.p2align 4
-L(find_diff):
-	mov	$1, %eax
-	jg	L(find_diff_bigger)
-	neg	%eax
-	ret
-
-	.p2align 4
-L(find_diff_bigger):
-	ret
-# endif
-
-	.p2align 4
-L(equal):
-	xor	%eax, %eax
-	ret
-
-END (MEMCMP)
-#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S b/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
deleted file mode 100644
index a41ef95fc1..0000000000
--- a/sysdeps/x86_64/multiarch/wmemcmp-ssse3.S
+++ /dev/null
@@ -1,4 +0,0 @@
-#define USE_AS_WMEMCMP 1
-#define MEMCMP __wmemcmp_ssse3
-
-#include "memcmp-ssse3.S"
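
The "Warning!" comment in the deleted file is worth spelling out:
memcmp compares elements as unsigned bytes, while wmemcmp compares
wchar_t values, which are signed 32-bit integers on x86_64.  A minimal
standalone illustration (assuming glibc's signed 32-bit wchar_t; the
out-of-range conversion below is implementation-defined but wraps on
GCC):

#include <stdio.h>
#include <string.h>
#include <wchar.h>

int
main (void)
{
  /* memcmp: bytes compare as unsigned char, so 0x80 > 0x7f.  */
  unsigned char a = 0x80, b = 0x7f;
  printf ("memcmp > 0:  %d\n", memcmp (&a, &b, 1) > 0);    /* 1 */

  /* wmemcmp: elements compare as wchar_t, a signed int here, so
     (wchar_t) 0x80000000 is negative and sorts below 0x7fffffff.  */
  wchar_t wa = (wchar_t) 0x80000000, wb = 0x7fffffff;
  printf ("wmemcmp < 0: %d\n", wmemcmp (&wa, &wb, 1) < 0); /* 1 */
  return 0;
}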
-- 
2.25.1


* Re: [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3
@ 2022-03-28  8:10 Mayshao-oc
  2022-03-28 13:07 ` H.J. Lu
  0 siblings, 1 reply; 56+ messages in thread
From: Mayshao-oc @ 2022-03-28  8:10 UTC (permalink / raw)
  To: goldstein.w.n
  Cc: GNU C Library, H.J. Lu, Florian Weimer, Carlos O'Donell,
	Louis Qi(BJ-RD)

On Fri, Mar 25, 2022 at 6:36 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:

> With SSE2, SSE4.1, AVX2, and EVEX versions, very few targets prefer
> SSSE3. As a result, it is no longer worth the code size cost of
> keeping the SSSE3 versions.
> ---
> sysdeps/x86_64/multiarch/Makefile          |    2 -
> sysdeps/x86_64/multiarch/ifunc-impl-list.c |   15 -
> sysdeps/x86_64/multiarch/ifunc-memmove.h   |   18 +-
> sysdeps/x86_64/multiarch/memcpy-ssse3.S    | 3151 --------------------
> sysdeps/x86_64/multiarch/memmove-ssse3.S   |    4 -
> 5 files changed, 7 insertions(+), 3183 deletions(-)
> delete mode 100644 sysdeps/x86_64/multiarch/memcpy-ssse3.S
> delete mode 100644 sysdeps/x86_64/multiarch/memmove-ssse3.S


On some platforms, such as Zhaoxin, the SSSE3 memcpy performs better
than the AVX2 version, and current systems have ample disk and memory
capacity, so the code size saving does not justify the removal.

It is strongly recommended to keep the SSSE3 version.
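
The difference can be measured by masking CPU features via tunables
and running glibc's benchtests, e.g. (the exact hwcaps token names
vary by glibc version, so treat this invocation as an assumption):

  GLIBC_TUNABLES=glibc.cpu.hwcaps=-AVX2_Usable ./benchtests/bench-memcpy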

Best Regards,
May Shao





Thread overview: 56+ messages
2022-03-25 18:36 [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 18:36 ` [PATCH v1 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 19:55   ` H.J. Lu
2022-03-25 20:44   ` [PATCH v2 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-03-25 20:44     ` [PATCH v2 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-03-25 20:44     ` [PATCH v2 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-04-10  0:57       ` [PATCH v4 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-03-25 20:44     ` [PATCH v2 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 20:44     ` [PATCH v2 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 20:44     ` [PATCH v2 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 1/6] " Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 2/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 3/6] x86: Reduce code size of Remove mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 4/6] x86: Optimize memcmp SSE2 in memcmp.S Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 5/6] x86: Remove memcmp-sse4.S Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:42   ` [PATCH v3 6/6] x86: Cleanup page cross code in memcmp-avx2-movbe.S Noah Goldstein
2022-04-10  0:48     ` Noah Goldstein
2022-04-10  0:54   ` [PATCH v4 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-10  0:54     ` [PATCH v4 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-10  0:54     ` [PATCH v4 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-10  0:54     ` [PATCH v4 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-10  0:54     ` [PATCH v4 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 16:47   ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 Noah Goldstein
2022-04-14 16:47     ` [PATCH v5 2/6] x86: Remove str{n}{case}cmp-ssse3 Noah Goldstein
2022-04-14 18:05       ` H.J. Lu
2022-04-14 16:47     ` [PATCH v5 3/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-04-14 18:06       ` H.J. Lu
2022-04-14 16:47     ` [PATCH v5 4/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-04-14 18:10       ` H.J. Lu
2022-04-14 16:47     ` [PATCH v5 5/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-04-14 18:13       ` H.J. Lu
2022-04-14 16:47     ` [PATCH v5 6/6] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3 Noah Goldstein
2022-04-14 18:04     ` [PATCH v5 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 18:36 ` [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Noah Goldstein
2022-03-25 19:56   ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 4/6] x86: Remove mem{move|cpy}-ssse3-back Noah Goldstein
2022-03-25 19:56   ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 5/6] x86: Remove str{n}cat-ssse3 Noah Goldstein
2022-03-25 19:57   ` H.J. Lu
2022-03-25 18:36 ` [PATCH v1 6/6] x86: Remove str{p}{n}cpy-ssse3 Noah Goldstein
2022-03-25 19:57   ` H.J. Lu
2022-03-25 19:54 ` [PATCH v1 1/6] x86: Remove {w}memcmp-ssse3 H.J. Lu
2022-03-25 20:34 ` Andreas Schwab
2022-03-25 20:40   ` Noah Goldstein
2022-03-28  8:10 [PATCH v1 3/6] x86: Remove mem{move|cpy}-ssse3 Mayshao-oc
2022-03-28 13:07 ` H.J. Lu
2022-03-29  2:51   ` Mayshao-oc
2022-03-29  2:57     ` Noah Goldstein
2022-03-30  9:56       ` Mayshao-oc
2022-03-30 16:45         ` Noah Goldstein
2022-03-30 16:54           ` Noah Goldstein
