* [glibc] x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
@ 2022-04-15  5:02 Noah Goldstein
From: Noah Goldstein @ 2022-04-15  5:02 UTC
  To: glibc-cvs

https://sourceware.org/git/gitweb.cgi?p=glibc.git;h=26b2478322db94edc9e0e8f577b2f71d291e5acb

commit 26b2478322db94edc9e0e8f577b2f71d291e5acb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 14 11:47:40 2022 -0500

    x86: Reduce code size of mem{move|pcpy|cpy}-ssse3
    
    The goal is to remove most SSSE3 functions, as SSE4, AVX2, and EVEX are
    generally preferable.  memcpy/memmove is one exception where avoiding
    unaligned loads with `palignr` is important for some targets.
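    
    As context, a minimal sketch (not the glibc implementation) of the
    `palignr` trick: two aligned 16-byte loads are combined into the
    misaligned 16 bytes between them, so the copy loop never issues an
    unaligned load.  SHIFT and load_shifted are hypothetical names, and
    SHIFT stands in for the source misalignment.
    
        #include <tmmintrin.h>
    
        #define SHIFT 3  /* hypothetical source misalignment, 1..15 */
    
        /* Return the 16 bytes starting at aligned_src + SHIFT using two
           aligned loads combined with palignr (_mm_alignr_epi8), instead
           of one unaligned load from aligned_src + SHIFT.  */
        static __m128i
        load_shifted (const char *aligned_src)
        {
          __m128i lo = _mm_load_si128 ((const __m128i *) aligned_src);
          __m128i hi = _mm_load_si128 ((const __m128i *) (aligned_src + 16));
          return _mm_alignr_epi8 (hi, lo, SHIFT);
        }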
    
    This commit replaces memmove-ssse3 with a better optimized, lower
    code footprint version.  It also aliases memcpy to memmove.
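    
    Aliasing memcpy to memmove is safe in this direction because memmove
    has the stricter contract (it must handle overlapping buffers).  A
    sketch in C of the reasoning, not of the glibc aliasing mechanism:
    
        #include <string.h>
    
        /* Any correct memmove is also a correct memcpy: memcpy merely adds
           the precondition that the buffers must not overlap.  The reverse
           aliasing (memmove -> memcpy) would not be safe.  */
        void *
        memcpy_via_memmove (void *dst, const void *src, size_t n)
        {
          return memmove (dst, src, n);
        }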
    
    Aside from this function, all other SSSE3 functions should be safe to
    remove.
    
    Performance is not changed drastically; it shows overall improvements
    without any major regressions or gains.
    
    bench-memcpy geometric_mean(N=50) New / Original: 0.957
    
    bench-memcpy-random geometric_mean(N=50) New / Original: 0.912
    
    bench-memcpy-large geometric_mean(N=50) New / Original: 0.892
    
    Benchmarks were run on Zhaoxin KX-6840@2000MHz.  See the attached
    numbers for all results.
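    
    For reference, each ratio above is presumably the geometric mean over
    the N=50 samples of new time divided by original time, so values
    below 1.0 mean the new implementation is faster:
    
        \text{geometric\_mean} = \Bigl(\prod_{i=1}^{N} \frac{t_i^{\text{new}}}{t_i^{\text{orig}}}\Bigr)^{1/N}, \qquad N = 50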
    
    More importantly, this saves 7246 bytes of code size in memmove and
    an additional 10741 bytes by reusing the memmove code for memcpy
    (17987 bytes saved in total), as well as an additional 896 bytes of
    rodata for the jump table entries.

Diff:
---
 sysdeps/x86_64/multiarch/Makefile        |    1 -
 sysdeps/x86_64/multiarch/memcpy-ssse3.S  | 3151 ------------------------------
 sysdeps/x86_64/multiarch/memmove-ssse3.S |  384 +++-
 3 files changed, 380 insertions(+), 3156 deletions(-)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 303fb5d734..e7ea963fc0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -16,7 +16,6 @@ sysdep_routines += \
   memcmpeq-avx2-rtm \
   memcmpeq-evex \
   memcmpeq-sse2 \
-  memcpy-ssse3 \
   memmove-avx-unaligned-erms \
   memmove-avx-unaligned-erms-rtm \
   memmove-avx512-no-vzeroupper \
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
deleted file mode 100644
index 65644d3a09..0000000000
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ /dev/null
@@ -1,3151 +0,0 @@
-/* memcpy with SSSE3
-   Copyright (C) 2010-2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-#if IS_IN (libc)
-
-#include "asm-syntax.h"
-
-#ifndef MEMCPY
-# define MEMCPY		__memcpy_ssse3
-# define MEMCPY_CHK	__memcpy_chk_ssse3
-# define MEMPCPY	__mempcpy_ssse3
-# define MEMPCPY_CHK	__mempcpy_chk_ssse3
-#endif
-
-#define JMPTBL(I, B)	I - B
-
-/* Branch to an entry in a jump table.  TABLE is a jump table with
-   relative offsets.  INDEX is a register contains the index into the
-   jump table.  SCALE is the scale of INDEX.  */
-#define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE)		\
-  lea		TABLE(%rip), %r11;				\
-  movslq	(%r11, INDEX, SCALE), INDEX;			\
-  lea		(%r11, INDEX), INDEX;				\
-  _CET_NOTRACK jmp *INDEX;					\
-  ud2
-
-	.section .text.ssse3,"ax",@progbits
-#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
-ENTRY (MEMPCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMPCPY_CHK)
-
-ENTRY (MEMPCPY)
-	mov	%RDI_LP, %RAX_LP
-	add	%RDX_LP, %RAX_LP
-	jmp	L(start)
-END (MEMPCPY)
-#endif
-
-#if !defined USE_AS_BCOPY
-ENTRY (MEMCPY_CHK)
-	cmp	%RDX_LP, %RCX_LP
-	jb	HIDDEN_JUMPTARGET (__chk_fail)
-END (MEMCPY_CHK)
-#endif
-
-ENTRY (MEMCPY)
-	mov	%RDI_LP, %RAX_LP
-#ifdef USE_AS_MEMPCPY
-	add	%RDX_LP, %RAX_LP
-#endif
-
-#ifdef __ILP32__
-	/* Clear the upper 32 bits.  */
-	mov	%edx, %edx
-#endif
-
-#ifdef USE_AS_MEMMOVE
-	cmp	%rsi, %rdi
-	jb	L(copy_forward)
-	je	L(write_0bytes)
-	cmp	$79, %rdx
-	jbe	L(copy_forward)
-	jmp	L(copy_backward)
-L(copy_forward):
-#endif
-L(start):
-	cmp	$79, %rdx
-	lea     L(table_less_80bytes)(%rip), %r11
-	ja	L(80bytesormore)
-	movslq	(%r11, %rdx, 4), %r9
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	add	%r11, %r9
-	_CET_NOTRACK jmp *%r9
-	ud2
-
-	.p2align 4
-L(80bytesormore):
-#ifndef USE_AS_MEMMOVE
-	cmp	%dil, %sil
-	jle	L(copy_backward)
-#endif
-
-	movdqu	(%rsi), %xmm0
-	mov	%rdi, %rcx
-	and	$-16, %rdi
-	add	$16, %rdi
-	mov	%rcx, %r8
-	sub	%rdi, %rcx
-	add	%rcx, %rdx
-	sub	%rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-	cmp	%rcx, %rdx
-	mov	%rsi, %r9
-	ja	L(large_page_fwd)
-	and	$0xf, %r9
-	jz	L(shl_0)
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_data_cache_size_half(%rip), %RCX_LP
-#endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table), %r9, 4)
-
-	.p2align 4
-L(copy_backward):
-	movdqu	-16(%rsi, %rdx), %xmm0
-	add	%rdx, %rsi
-	lea	-16(%rdi, %rdx), %r8
-	add	%rdx, %rdi
-
-	mov	%rdi, %rcx
-	and	$0xf, %rcx
-	xor	%rcx, %rdi
-	sub	%rcx, %rdx
-	sub	%rcx, %rsi
-
-#ifdef SHARED_CACHE_SIZE_HALF
-	mov	$SHARED_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_shared_cache_size_half(%rip), %RCX_LP
-#endif
-
-	cmp	%rcx, %rdx
-	mov	%rsi, %r9
-	ja	L(large_page_bwd)
-	and	$0xf, %r9
-	jz	L(shl_0_bwd)
-#ifdef DATA_CACHE_SIZE_HALF
-	mov	$DATA_CACHE_SIZE_HALF, %RCX_LP
-#else
-	mov	__x86_data_cache_size_half(%rip), %RCX_LP
-#endif
-	BRANCH_TO_JMPTBL_ENTRY (L(shl_table_bwd), %r9, 4)
-
-	.p2align 4
-L(shl_0):
-	sub	$16, %rdx
-	movdqa	(%rsi), %xmm1
-	add	$16, %rsi
-	movdqa	%xmm1, (%rdi)
-	add	$16, %rdi
-	cmp	$128, %rdx
-	movdqu	%xmm0, (%r8)
-	ja	L(shl_0_gobble)
-	cmp	$64, %rdx
-	jb	L(shl_0_less_64bytes)
-	movaps	(%rsi), %xmm4
-	movaps	16(%rsi), %xmm1
-	movaps	32(%rsi), %xmm2
-	movaps	48(%rsi), %xmm3
-	movaps	%xmm4, (%rdi)
-	movaps	%xmm1, 16(%rdi)
-	movaps	%xmm2, 32(%rdi)
-	movaps	%xmm3, 48(%rdi)
-	sub	$64, %rdx
-	add	$64, %rsi
-	add	$64, %rdi
-L(shl_0_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
-	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
-#endif
-	lea	-128(%rdx), %rdx
-	jae	L(shl_0_gobble_mem_loop)
-L(shl_0_gobble_cache_loop):
-	movdqa	(%rsi), %xmm4
-	movaps	0x10(%rsi), %xmm1
-	movaps	0x20(%rsi), %xmm2
-	movaps	0x30(%rsi), %xmm3
-
-	movdqa	%xmm4, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-
-	sub	$128, %rdx
-	movaps	0x40(%rsi), %xmm4
-	movaps	0x50(%rsi), %xmm5
-	movaps	0x60(%rsi), %xmm6
-	movaps	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-	movaps	%xmm4, 0x40(%rdi)
-	movaps	%xmm5, 0x50(%rdi)
-	movaps	%xmm6, 0x60(%rdi)
-	movaps	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_cache_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_cache_less_64bytes)
-
-	movdqa	(%rsi), %xmm4
-	sub	$0x40, %rdx
-	movdqa	0x10(%rsi), %xmm1
-
-	movdqa	%xmm4, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-
-	movdqa	0x20(%rsi), %xmm4
-	movdqa	0x30(%rsi), %xmm1
-	add	$0x40, %rsi
-
-	movdqa	%xmm4, 0x20(%rdi)
-	movdqa	%xmm1, 0x30(%rdi)
-	add	$0x40, %rdi
-L(shl_0_cache_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_loop):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x280(%rsi)
-
-	movdqa	(%rsi), %xmm0
-	movdqa	0x10(%rsi), %xmm1
-	movdqa	0x20(%rsi), %xmm2
-	movdqa	0x30(%rsi), %xmm3
-	movdqa	0x40(%rsi), %xmm4
-	movdqa	0x50(%rsi), %xmm5
-	movdqa	0x60(%rsi), %xmm6
-	movdqa	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-	sub	$0x80, %rdx
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-	movdqa	%xmm2, 0x20(%rdi)
-	movdqa	%xmm3, 0x30(%rdi)
-	movdqa	%xmm4, 0x40(%rdi)
-	movdqa	%xmm5, 0x50(%rdi)
-	movdqa	%xmm6, 0x60(%rdi)
-	movdqa	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_mem_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_mem_less_64bytes)
-
-	movdqa	(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	0x10(%rsi), %xmm1
-
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-
-	movdqa	0x20(%rsi), %xmm0
-	movdqa	0x30(%rsi), %xmm1
-	add	$0x40, %rsi
-
-	movdqa	%xmm0, 0x20(%rdi)
-	movdqa	%xmm1, 0x30(%rdi)
-	add	$0x40, %rdi
-L(shl_0_mem_less_64bytes):
-	cmp	$0x20, %rdx
-	jb	L(shl_0_mem_less_32bytes)
-	movdqa	(%rsi), %xmm0
-	sub	$0x20, %rdx
-	movdqa	0x10(%rsi), %xmm1
-	add	$0x20, %rsi
-	movdqa	%xmm0, (%rdi)
-	movdqa	%xmm1, 0x10(%rdi)
-	add	$0x20, %rdi
-L(shl_0_mem_less_32bytes):
-	add	%rdx, %rdi
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_bwd):
-	sub	$16, %rdx
-	movdqa	-0x10(%rsi), %xmm1
-	sub	$16, %rsi
-	movdqa	%xmm1, -0x10(%rdi)
-	sub	$16, %rdi
-	cmp	$0x80, %rdx
-	movdqu	%xmm0, (%r8)
-	ja	L(shl_0_gobble_bwd)
-	cmp	$64, %rdx
-	jb	L(shl_0_less_64bytes_bwd)
-	movaps	-0x10(%rsi), %xmm0
-	movaps	-0x20(%rsi), %xmm1
-	movaps	-0x30(%rsi), %xmm2
-	movaps	-0x40(%rsi), %xmm3
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	sub	$64, %rdx
-	sub	$0x40, %rsi
-	sub	$0x40, %rdi
-L(shl_0_less_64bytes_bwd):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_bwd):
-#ifdef DATA_CACHE_SIZE_HALF
-	cmp	$DATA_CACHE_SIZE_HALF, %RDX_LP
-#else
-	cmp	__x86_data_cache_size_half(%rip), %RDX_LP
-#endif
-	lea	-128(%rdx), %rdx
-	jae	L(shl_0_gobble_mem_bwd_loop)
-L(shl_0_gobble_bwd_loop):
-	movdqa	-0x10(%rsi), %xmm0
-	movaps	-0x20(%rsi), %xmm1
-	movaps	-0x30(%rsi), %xmm2
-	movaps	-0x40(%rsi), %xmm3
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-
-	sub	$0x80, %rdx
-	movaps	-0x50(%rsi), %xmm4
-	movaps	-0x60(%rsi), %xmm5
-	movaps	-0x70(%rsi), %xmm6
-	movaps	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-	movaps	%xmm4, -0x50(%rdi)
-	movaps	%xmm5, -0x60(%rdi)
-	movaps	%xmm6, -0x70(%rdi)
-	movaps	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_gobble_bwd_less_64bytes)
-
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-
-	movdqa	-0x30(%rsi), %xmm0
-	movdqa	-0x40(%rsi), %xmm1
-	sub	$0x40, %rsi
-
-	movdqa	%xmm0, -0x30(%rdi)
-	movdqa	%xmm1, -0x40(%rdi)
-	sub	$0x40, %rdi
-L(shl_0_gobble_bwd_less_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_0_gobble_mem_bwd_loop):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x280(%rsi)
-	movdqa	-0x10(%rsi), %xmm0
-	movdqa	-0x20(%rsi), %xmm1
-	movdqa	-0x30(%rsi), %xmm2
-	movdqa	-0x40(%rsi), %xmm3
-	movdqa	-0x50(%rsi), %xmm4
-	movdqa	-0x60(%rsi), %xmm5
-	movdqa	-0x70(%rsi), %xmm6
-	movdqa	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-	sub	$0x80, %rdx
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-	movdqa	%xmm2, -0x30(%rdi)
-	movdqa	%xmm3, -0x40(%rdi)
-	movdqa	%xmm4, -0x50(%rdi)
-	movdqa	%xmm5, -0x60(%rdi)
-	movdqa	%xmm6, -0x70(%rdi)
-	movdqa	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-
-	jae	L(shl_0_gobble_mem_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(shl_0_mem_bwd_less_64bytes)
-
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x40, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-
-	movdqa	-0x30(%rsi), %xmm0
-	movdqa	-0x40(%rsi), %xmm1
-	sub	$0x40, %rsi
-
-	movdqa	%xmm0, -0x30(%rdi)
-	movdqa	%xmm1, -0x40(%rdi)
-	sub	$0x40, %rdi
-L(shl_0_mem_bwd_less_64bytes):
-	cmp	$0x20, %rdx
-	jb	L(shl_0_mem_bwd_less_32bytes)
-	movdqa	-0x10(%rsi), %xmm0
-	sub	$0x20, %rdx
-	movdqa	-0x20(%rsi), %xmm1
-	sub	$0x20, %rsi
-	movdqa	%xmm0, -0x10(%rdi)
-	movdqa	%xmm1, -0x20(%rdi)
-	sub	$0x20, %rdi
-L(shl_0_mem_bwd_less_32bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_1):
-	lea	(L(shl_1_loop_L1)-L(shl_1))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x01(%rsi), %xmm1
-	jb	L(L1_fwd)
-	lea	(L(shl_1_loop_L2)-L(shl_1_loop_L1))(%r9), %r9
-L(L1_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_1_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0f(%rsi), %xmm2
-	movaps	0x1f(%rsi), %xmm3
-	movaps	0x2f(%rsi), %xmm4
-	movaps	0x3f(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$1, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$1, %xmm3, %xmm4
-	palignr	$1, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$1, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_1_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_1_bwd):
-	lea	(L(shl_1_bwd_loop_L1)-L(shl_1_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x01(%rsi), %xmm1
-	jb	L(L1_bwd)
-	lea	(L(shl_1_bwd_loop_L2)-L(shl_1_bwd_loop_L1))(%r9), %r9
-L(L1_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_1_bwd_loop_L1):
-	movaps	-0x11(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x21(%rsi), %xmm3
-	movaps	-0x31(%rsi), %xmm4
-	movaps	-0x41(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$1, %xmm2, %xmm1
-	palignr	$1, %xmm3, %xmm2
-	palignr	$1, %xmm4, %xmm3
-	palignr	$1, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_1_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_1_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_2):
-	lea	(L(shl_2_loop_L1)-L(shl_2))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x02(%rsi), %xmm1
-	jb	L(L2_fwd)
-	lea	(L(shl_2_loop_L2)-L(shl_2_loop_L1))(%r9), %r9
-L(L2_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_2_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0e(%rsi), %xmm2
-	movaps	0x1e(%rsi), %xmm3
-	movaps	0x2e(%rsi), %xmm4
-	movaps	0x3e(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$2, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$2, %xmm3, %xmm4
-	palignr	$2, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$2, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_2_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_2_bwd):
-	lea	(L(shl_2_bwd_loop_L1)-L(shl_2_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x02(%rsi), %xmm1
-	jb	L(L2_bwd)
-	lea	(L(shl_2_bwd_loop_L2)-L(shl_2_bwd_loop_L1))(%r9), %r9
-L(L2_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_2_bwd_loop_L1):
-	movaps	-0x12(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x22(%rsi), %xmm3
-	movaps	-0x32(%rsi), %xmm4
-	movaps	-0x42(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$2, %xmm2, %xmm1
-	palignr	$2, %xmm3, %xmm2
-	palignr	$2, %xmm4, %xmm3
-	palignr	$2, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_2_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_2_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_3):
-	lea	(L(shl_3_loop_L1)-L(shl_3))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x03(%rsi), %xmm1
-	jb	L(L3_fwd)
-	lea	(L(shl_3_loop_L2)-L(shl_3_loop_L1))(%r9), %r9
-L(L3_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_3_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0d(%rsi), %xmm2
-	movaps	0x1d(%rsi), %xmm3
-	movaps	0x2d(%rsi), %xmm4
-	movaps	0x3d(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$3, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$3, %xmm3, %xmm4
-	palignr	$3, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$3, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_3_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_3_bwd):
-	lea	(L(shl_3_bwd_loop_L1)-L(shl_3_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x03(%rsi), %xmm1
-	jb	L(L3_bwd)
-	lea	(L(shl_3_bwd_loop_L2)-L(shl_3_bwd_loop_L1))(%r9), %r9
-L(L3_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_3_bwd_loop_L1):
-	movaps	-0x13(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x23(%rsi), %xmm3
-	movaps	-0x33(%rsi), %xmm4
-	movaps	-0x43(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$3, %xmm2, %xmm1
-	palignr	$3, %xmm3, %xmm2
-	palignr	$3, %xmm4, %xmm3
-	palignr	$3, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_3_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_3_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_4):
-	lea	(L(shl_4_loop_L1)-L(shl_4))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x04(%rsi), %xmm1
-	jb	L(L4_fwd)
-	lea	(L(shl_4_loop_L2)-L(shl_4_loop_L1))(%r9), %r9
-L(L4_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_4_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0c(%rsi), %xmm2
-	movaps	0x1c(%rsi), %xmm3
-	movaps	0x2c(%rsi), %xmm4
-	movaps	0x3c(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$4, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$4, %xmm3, %xmm4
-	palignr	$4, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$4, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_4_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_4_bwd):
-	lea	(L(shl_4_bwd_loop_L1)-L(shl_4_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x04(%rsi), %xmm1
-	jb	L(L4_bwd)
-	lea	(L(shl_4_bwd_loop_L2)-L(shl_4_bwd_loop_L1))(%r9), %r9
-L(L4_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_4_bwd_loop_L1):
-	movaps	-0x14(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x24(%rsi), %xmm3
-	movaps	-0x34(%rsi), %xmm4
-	movaps	-0x44(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$4, %xmm2, %xmm1
-	palignr	$4, %xmm3, %xmm2
-	palignr	$4, %xmm4, %xmm3
-	palignr	$4, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_4_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_4_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_5):
-	lea	(L(shl_5_loop_L1)-L(shl_5))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x05(%rsi), %xmm1
-	jb	L(L5_fwd)
-	lea	(L(shl_5_loop_L2)-L(shl_5_loop_L1))(%r9), %r9
-L(L5_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_5_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0b(%rsi), %xmm2
-	movaps	0x1b(%rsi), %xmm3
-	movaps	0x2b(%rsi), %xmm4
-	movaps	0x3b(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$5, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$5, %xmm3, %xmm4
-	palignr	$5, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$5, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_5_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_5_bwd):
-	lea	(L(shl_5_bwd_loop_L1)-L(shl_5_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x05(%rsi), %xmm1
-	jb	L(L5_bwd)
-	lea	(L(shl_5_bwd_loop_L2)-L(shl_5_bwd_loop_L1))(%r9), %r9
-L(L5_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_5_bwd_loop_L1):
-	movaps	-0x15(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x25(%rsi), %xmm3
-	movaps	-0x35(%rsi), %xmm4
-	movaps	-0x45(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$5, %xmm2, %xmm1
-	palignr	$5, %xmm3, %xmm2
-	palignr	$5, %xmm4, %xmm3
-	palignr	$5, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_5_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_5_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_6):
-	lea	(L(shl_6_loop_L1)-L(shl_6))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x06(%rsi), %xmm1
-	jb	L(L6_fwd)
-	lea	(L(shl_6_loop_L2)-L(shl_6_loop_L1))(%r9), %r9
-L(L6_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_6_loop_L1):
-	sub	$64, %rdx
-	movaps	0x0a(%rsi), %xmm2
-	movaps	0x1a(%rsi), %xmm3
-	movaps	0x2a(%rsi), %xmm4
-	movaps	0x3a(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$6, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$6, %xmm3, %xmm4
-	palignr	$6, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$6, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_6_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_6_bwd):
-	lea	(L(shl_6_bwd_loop_L1)-L(shl_6_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x06(%rsi), %xmm1
-	jb	L(L6_bwd)
-	lea	(L(shl_6_bwd_loop_L2)-L(shl_6_bwd_loop_L1))(%r9), %r9
-L(L6_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_6_bwd_loop_L1):
-	movaps	-0x16(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x26(%rsi), %xmm3
-	movaps	-0x36(%rsi), %xmm4
-	movaps	-0x46(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$6, %xmm2, %xmm1
-	palignr	$6, %xmm3, %xmm2
-	palignr	$6, %xmm4, %xmm3
-	palignr	$6, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_6_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_6_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_7):
-	lea	(L(shl_7_loop_L1)-L(shl_7))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x07(%rsi), %xmm1
-	jb	L(L7_fwd)
-	lea	(L(shl_7_loop_L2)-L(shl_7_loop_L1))(%r9), %r9
-L(L7_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_7_loop_L1):
-	sub	$64, %rdx
-	movaps	0x09(%rsi), %xmm2
-	movaps	0x19(%rsi), %xmm3
-	movaps	0x29(%rsi), %xmm4
-	movaps	0x39(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$7, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$7, %xmm3, %xmm4
-	palignr	$7, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$7, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_7_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_7_bwd):
-	lea	(L(shl_7_bwd_loop_L1)-L(shl_7_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x07(%rsi), %xmm1
-	jb	L(L7_bwd)
-	lea	(L(shl_7_bwd_loop_L2)-L(shl_7_bwd_loop_L1))(%r9), %r9
-L(L7_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_7_bwd_loop_L1):
-	movaps	-0x17(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x27(%rsi), %xmm3
-	movaps	-0x37(%rsi), %xmm4
-	movaps	-0x47(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$7, %xmm2, %xmm1
-	palignr	$7, %xmm3, %xmm2
-	palignr	$7, %xmm4, %xmm3
-	palignr	$7, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_7_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_7_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_8):
-	lea	(L(shl_8_loop_L1)-L(shl_8))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x08(%rsi), %xmm1
-	jb	L(L8_fwd)
-	lea	(L(shl_8_loop_L2)-L(shl_8_loop_L1))(%r9), %r9
-L(L8_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-L(shl_8_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_8_loop_L1):
-	sub	$64, %rdx
-	movaps	0x08(%rsi), %xmm2
-	movaps	0x18(%rsi), %xmm3
-	movaps	0x28(%rsi), %xmm4
-	movaps	0x38(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$8, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$8, %xmm3, %xmm4
-	palignr	$8, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$8, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_8_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-	.p2align 4
-L(shl_8_end):
-	lea	64(%rdx), %rdx
-	movaps	%xmm4, -0x20(%rdi)
-	add	%rdx, %rsi
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_8_bwd):
-	lea	(L(shl_8_bwd_loop_L1)-L(shl_8_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x08(%rsi), %xmm1
-	jb	L(L8_bwd)
-	lea	(L(shl_8_bwd_loop_L2)-L(shl_8_bwd_loop_L1))(%r9), %r9
-L(L8_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_8_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_8_bwd_loop_L1):
-	movaps	-0x18(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x28(%rsi), %xmm3
-	movaps	-0x38(%rsi), %xmm4
-	movaps	-0x48(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$8, %xmm2, %xmm1
-	palignr	$8, %xmm3, %xmm2
-	palignr	$8, %xmm4, %xmm3
-	palignr	$8, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_8_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_8_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_9):
-	lea	(L(shl_9_loop_L1)-L(shl_9))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x09(%rsi), %xmm1
-	jb	L(L9_fwd)
-	lea	(L(shl_9_loop_L2)-L(shl_9_loop_L1))(%r9), %r9
-L(L9_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_9_loop_L1):
-	sub	$64, %rdx
-	movaps	0x07(%rsi), %xmm2
-	movaps	0x17(%rsi), %xmm3
-	movaps	0x27(%rsi), %xmm4
-	movaps	0x37(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$9, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$9, %xmm3, %xmm4
-	palignr	$9, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$9, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_9_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_9_bwd):
-	lea	(L(shl_9_bwd_loop_L1)-L(shl_9_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x09(%rsi), %xmm1
-	jb	L(L9_bwd)
-	lea	(L(shl_9_bwd_loop_L2)-L(shl_9_bwd_loop_L1))(%r9), %r9
-L(L9_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_9_bwd_loop_L1):
-	movaps	-0x19(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x29(%rsi), %xmm3
-	movaps	-0x39(%rsi), %xmm4
-	movaps	-0x49(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$9, %xmm2, %xmm1
-	palignr	$9, %xmm3, %xmm2
-	palignr	$9, %xmm4, %xmm3
-	palignr	$9, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_9_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_9_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_10):
-	lea	(L(shl_10_loop_L1)-L(shl_10))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0a(%rsi), %xmm1
-	jb	L(L10_fwd)
-	lea	(L(shl_10_loop_L2)-L(shl_10_loop_L1))(%r9), %r9
-L(L10_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_10_loop_L1):
-	sub	$64, %rdx
-	movaps	0x06(%rsi), %xmm2
-	movaps	0x16(%rsi), %xmm3
-	movaps	0x26(%rsi), %xmm4
-	movaps	0x36(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$10, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$10, %xmm3, %xmm4
-	palignr	$10, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$10, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_10_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_10_bwd):
-	lea	(L(shl_10_bwd_loop_L1)-L(shl_10_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0a(%rsi), %xmm1
-	jb	L(L10_bwd)
-	lea	(L(shl_10_bwd_loop_L2)-L(shl_10_bwd_loop_L1))(%r9), %r9
-L(L10_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_10_bwd_loop_L1):
-	movaps	-0x1a(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2a(%rsi), %xmm3
-	movaps	-0x3a(%rsi), %xmm4
-	movaps	-0x4a(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$10, %xmm2, %xmm1
-	palignr	$10, %xmm3, %xmm2
-	palignr	$10, %xmm4, %xmm3
-	palignr	$10, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_10_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_10_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_11):
-	lea	(L(shl_11_loop_L1)-L(shl_11))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0b(%rsi), %xmm1
-	jb	L(L11_fwd)
-	lea	(L(shl_11_loop_L2)-L(shl_11_loop_L1))(%r9), %r9
-L(L11_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_11_loop_L1):
-	sub	$64, %rdx
-	movaps	0x05(%rsi), %xmm2
-	movaps	0x15(%rsi), %xmm3
-	movaps	0x25(%rsi), %xmm4
-	movaps	0x35(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$11, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$11, %xmm3, %xmm4
-	palignr	$11, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$11, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_11_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_11_bwd):
-	lea	(L(shl_11_bwd_loop_L1)-L(shl_11_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0b(%rsi), %xmm1
-	jb	L(L11_bwd)
-	lea	(L(shl_11_bwd_loop_L2)-L(shl_11_bwd_loop_L1))(%r9), %r9
-L(L11_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_11_bwd_loop_L1):
-	movaps	-0x1b(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2b(%rsi), %xmm3
-	movaps	-0x3b(%rsi), %xmm4
-	movaps	-0x4b(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$11, %xmm2, %xmm1
-	palignr	$11, %xmm3, %xmm2
-	palignr	$11, %xmm4, %xmm3
-	palignr	$11, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_11_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_11_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_12):
-	lea	(L(shl_12_loop_L1)-L(shl_12))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0c(%rsi), %xmm1
-	jb	L(L12_fwd)
-	lea	(L(shl_12_loop_L2)-L(shl_12_loop_L1))(%r9), %r9
-L(L12_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_12_loop_L1):
-	sub	$64, %rdx
-	movaps	0x04(%rsi), %xmm2
-	movaps	0x14(%rsi), %xmm3
-	movaps	0x24(%rsi), %xmm4
-	movaps	0x34(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$12, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$12, %xmm3, %xmm4
-	palignr	$12, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$12, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_12_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_12_bwd):
-	lea	(L(shl_12_bwd_loop_L1)-L(shl_12_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0c(%rsi), %xmm1
-	jb	L(L12_bwd)
-	lea	(L(shl_12_bwd_loop_L2)-L(shl_12_bwd_loop_L1))(%r9), %r9
-L(L12_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_12_bwd_loop_L1):
-	movaps	-0x1c(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2c(%rsi), %xmm3
-	movaps	-0x3c(%rsi), %xmm4
-	movaps	-0x4c(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$12, %xmm2, %xmm1
-	palignr	$12, %xmm3, %xmm2
-	palignr	$12, %xmm4, %xmm3
-	palignr	$12, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_12_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_12_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_13):
-	lea	(L(shl_13_loop_L1)-L(shl_13))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0d(%rsi), %xmm1
-	jb	L(L13_fwd)
-	lea	(L(shl_13_loop_L2)-L(shl_13_loop_L1))(%r9), %r9
-L(L13_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_13_loop_L1):
-	sub	$64, %rdx
-	movaps	0x03(%rsi), %xmm2
-	movaps	0x13(%rsi), %xmm3
-	movaps	0x23(%rsi), %xmm4
-	movaps	0x33(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$13, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$13, %xmm3, %xmm4
-	palignr	$13, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$13, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_13_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_13_bwd):
-	lea	(L(shl_13_bwd_loop_L1)-L(shl_13_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0d(%rsi), %xmm1
-	jb	L(L13_bwd)
-	lea	(L(shl_13_bwd_loop_L2)-L(shl_13_bwd_loop_L1))(%r9), %r9
-L(L13_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_13_bwd_loop_L1):
-	movaps	-0x1d(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2d(%rsi), %xmm3
-	movaps	-0x3d(%rsi), %xmm4
-	movaps	-0x4d(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$13, %xmm2, %xmm1
-	palignr	$13, %xmm3, %xmm2
-	palignr	$13, %xmm4, %xmm3
-	palignr	$13, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_13_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_13_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_14):
-	lea	(L(shl_14_loop_L1)-L(shl_14))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0e(%rsi), %xmm1
-	jb	L(L14_fwd)
-	lea	(L(shl_14_loop_L2)-L(shl_14_loop_L1))(%r9), %r9
-L(L14_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_14_loop_L1):
-	sub	$64, %rdx
-	movaps	0x02(%rsi), %xmm2
-	movaps	0x12(%rsi), %xmm3
-	movaps	0x22(%rsi), %xmm4
-	movaps	0x32(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$14, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$14, %xmm3, %xmm4
-	palignr	$14, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$14, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_14_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_14_bwd):
-	lea	(L(shl_14_bwd_loop_L1)-L(shl_14_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0e(%rsi), %xmm1
-	jb	L(L14_bwd)
-	lea	(L(shl_14_bwd_loop_L2)-L(shl_14_bwd_loop_L1))(%r9), %r9
-L(L14_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_14_bwd_loop_L1):
-	movaps	-0x1e(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2e(%rsi), %xmm3
-	movaps	-0x3e(%rsi), %xmm4
-	movaps	-0x4e(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$14, %xmm2, %xmm1
-	palignr	$14, %xmm3, %xmm2
-	palignr	$14, %xmm4, %xmm3
-	palignr	$14, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_14_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_14_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_15):
-	lea	(L(shl_15_loop_L1)-L(shl_15))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0f(%rsi), %xmm1
-	jb	L(L15_fwd)
-	lea	(L(shl_15_loop_L2)-L(shl_15_loop_L1))(%r9), %r9
-L(L15_fwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_loop_L2):
-	prefetchnta 0x1c0(%rsi)
-L(shl_15_loop_L1):
-	sub	$64, %rdx
-	movaps	0x01(%rsi), %xmm2
-	movaps	0x11(%rsi), %xmm3
-	movaps	0x21(%rsi), %xmm4
-	movaps	0x31(%rsi), %xmm5
-	movdqa	%xmm5, %xmm6
-	palignr	$15, %xmm4, %xmm5
-	lea	64(%rsi), %rsi
-	palignr	$15, %xmm3, %xmm4
-	palignr	$15, %xmm2, %xmm3
-	lea	64(%rdi), %rdi
-	palignr	$15, %xmm1, %xmm2
-	movdqa	%xmm6, %xmm1
-	movdqa	%xmm2, -0x40(%rdi)
-	movaps	%xmm3, -0x30(%rdi)
-	jb	L(shl_15_end)
-	movaps	%xmm4, -0x20(%rdi)
-	movaps	%xmm5, -0x10(%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_end):
-	movaps	%xmm4, -0x20(%rdi)
-	lea	64(%rdx), %rdx
-	movaps	%xmm5, -0x10(%rdi)
-	add	%rdx, %rdi
-	movdqu	%xmm0, (%r8)
-	add	%rdx, %rsi
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(shl_15_bwd):
-	lea	(L(shl_15_bwd_loop_L1)-L(shl_15_bwd))(%r9), %r9
-	cmp	%rcx, %rdx
-	movaps	-0x0f(%rsi), %xmm1
-	jb	L(L15_bwd)
-	lea	(L(shl_15_bwd_loop_L2)-L(shl_15_bwd_loop_L1))(%r9), %r9
-L(L15_bwd):
-	lea	-64(%rdx), %rdx
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_bwd_loop_L2):
-	prefetchnta -0x1c0(%rsi)
-L(shl_15_bwd_loop_L1):
-	movaps	-0x1f(%rsi), %xmm2
-	sub	$0x40, %rdx
-	movaps	-0x2f(%rsi), %xmm3
-	movaps	-0x3f(%rsi), %xmm4
-	movaps	-0x4f(%rsi), %xmm5
-	lea	-0x40(%rsi), %rsi
-	palignr	$15, %xmm2, %xmm1
-	palignr	$15, %xmm3, %xmm2
-	palignr	$15, %xmm4, %xmm3
-	palignr	$15, %xmm5, %xmm4
-
-	movaps	%xmm1, -0x10(%rdi)
-	movaps	%xmm5, %xmm1
-
-	movaps	%xmm2, -0x20(%rdi)
-	lea	-0x40(%rdi), %rdi
-
-	movaps	%xmm3, 0x10(%rdi)
-	jb	L(shl_15_bwd_end)
-	movaps	%xmm4, (%rdi)
-	_CET_NOTRACK jmp *%r9
-	ud2
-L(shl_15_bwd_end):
-	movaps	%xmm4, (%rdi)
-	lea	64(%rdx), %rdx
-	movdqu	%xmm0, (%r8)
-	BRANCH_TO_JMPTBL_ENTRY(L(table_less_80bytes), %rdx, 4)
-
-	.p2align 4
-L(write_72bytes):
-	movdqu	-72(%rsi), %xmm0
-	movdqu	-56(%rsi), %xmm1
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rcx
-	movdqu	 %xmm0, -72(%rdi)
-	movdqu	 %xmm1, -56(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_64bytes):
-	movdqu	-64(%rsi), %xmm0
-	mov	-48(%rsi), %rcx
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -64(%rdi)
-	mov	 %rcx, -48(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_56bytes):
-	movdqu	-56(%rsi), %xmm0
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rcx
-	movdqu	 %xmm0, -56(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rcx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_48bytes):
-	mov	-48(%rsi), %rcx
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -48(%rdi)
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_40bytes):
-	mov	-40(%rsi), %r8
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -40(%rdi)
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_32bytes):
-	mov	-32(%rsi), %r9
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -32(%rdi)
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_24bytes):
-	mov	-24(%rsi), %r10
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -24(%rdi)
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_16bytes):
-	mov	-16(%rsi), %r11
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -16(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_8bytes):
-	mov	-8(%rsi), %rdx
-	mov	 %rdx, -8(%rdi)
-L(write_0bytes):
-	ret
-
-	.p2align 4
-L(write_73bytes):
-	movdqu	-73(%rsi), %xmm0
-	movdqu	-57(%rsi), %xmm1
-	mov	-41(%rsi), %rcx
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %r8
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -73(%rdi)
-	movdqu	 %xmm1, -57(%rdi)
-	mov	 %rcx, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %r8, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_65bytes):
-	movdqu	-65(%rsi), %xmm0
-	movdqu	-49(%rsi), %xmm1
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -65(%rdi)
-	movdqu	 %xmm1, -49(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_57bytes):
-	movdqu	-57(%rsi), %xmm0
-	mov	-41(%rsi), %r8
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -57(%rdi)
-	mov	 %r8, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_49bytes):
-	movdqu	-49(%rsi), %xmm0
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -49(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_41bytes):
-	mov	-41(%rsi), %r8
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r8, -41(%rdi)
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_33bytes):
-	mov	-33(%rsi), %r9
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r9, -33(%rdi)
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_25bytes):
-	mov	-25(%rsi), %r10
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-1(%rsi), %dl
-	mov	 %r10, -25(%rdi)
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_17bytes):
-	mov	-17(%rsi), %r11
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -17(%rdi)
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_9bytes):
-	mov	-9(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -9(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_1bytes):
-	mov	-1(%rsi), %dl
-	mov	 %dl, -1(%rdi)
-	ret
-
-	.p2align 4
-L(write_74bytes):
-	movdqu	-74(%rsi), %xmm0
-	movdqu	-58(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -74(%rdi)
-	movdqu	 %xmm1, -58(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_66bytes):
-	movdqu	-66(%rsi), %xmm0
-	movdqu	-50(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -66(%rdi)
-	movdqu	 %xmm1, -50(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_58bytes):
-	movdqu	-58(%rsi), %xmm1
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm1, -58(%rdi)
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_50bytes):
-	movdqu	-50(%rsi), %xmm0
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -50(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_42bytes):
-	mov	-42(%rsi), %r8
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -42(%rdi)
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_34bytes):
-	mov	-34(%rsi), %r9
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -34(%rdi)
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_26bytes):
-	mov	-26(%rsi), %r10
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -26(%rdi)
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_18bytes):
-	mov	-18(%rsi), %r11
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -18(%rdi)
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_10bytes):
-	mov	-10(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -10(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_2bytes):
-	mov	-2(%rsi), %dx
-	mov	 %dx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(write_75bytes):
-	movdqu	-75(%rsi), %xmm0
-	movdqu	-59(%rsi), %xmm1
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -75(%rdi)
-	movdqu	 %xmm1, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_67bytes):
-	movdqu	-67(%rsi), %xmm0
-	movdqu	-59(%rsi), %xmm1
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -67(%rdi)
-	movdqu	 %xmm1, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_59bytes):
-	movdqu	-59(%rsi), %xmm0
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -59(%rdi)
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_51bytes):
-	movdqu	-51(%rsi), %xmm0
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -51(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_43bytes):
-	mov	-43(%rsi), %r8
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -43(%rdi)
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_35bytes):
-	mov	-35(%rsi), %r9
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -35(%rdi)
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_27bytes):
-	mov	-27(%rsi), %r10
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -27(%rdi)
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_19bytes):
-	mov	-19(%rsi), %r11
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -19(%rdi)
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_11bytes):
-	mov	-11(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -11(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_3bytes):
-	mov	-3(%rsi), %dx
-	mov	-2(%rsi), %cx
-	mov	 %dx, -3(%rdi)
-	mov	 %cx, -2(%rdi)
-	ret
-
-	.p2align 4
-L(write_76bytes):
-	movdqu	-76(%rsi), %xmm0
-	movdqu	-60(%rsi), %xmm1
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -76(%rdi)
-	movdqu	 %xmm1, -60(%rdi)
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_68bytes):
-	movdqu	-68(%rsi), %xmm0
-	movdqu	-52(%rsi), %xmm1
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -68(%rdi)
-	movdqu	 %xmm1, -52(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_60bytes):
-	movdqu	-60(%rsi), %xmm0
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -60(%rdi)
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_52bytes):
-	movdqu	-52(%rsi), %xmm0
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	movdqu	 %xmm0, -52(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_44bytes):
-	mov	-44(%rsi), %r8
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r8, -44(%rdi)
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_36bytes):
-	mov	-36(%rsi), %r9
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r9, -36(%rdi)
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_28bytes):
-	mov	-28(%rsi), %r10
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r10, -28(%rdi)
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_20bytes):
-	mov	-20(%rsi), %r11
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %r11, -20(%rdi)
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_12bytes):
-	mov	-12(%rsi), %rcx
-	mov	-4(%rsi), %edx
-	mov	 %rcx, -12(%rdi)
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_4bytes):
-	mov	-4(%rsi), %edx
-	mov	 %edx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_77bytes):
-	movdqu	-77(%rsi), %xmm0
-	movdqu	-61(%rsi), %xmm1
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -77(%rdi)
-	movdqu	 %xmm1, -61(%rdi)
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_69bytes):
-	movdqu	-69(%rsi), %xmm0
-	movdqu	-53(%rsi), %xmm1
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -69(%rdi)
-	movdqu	 %xmm1, -53(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_61bytes):
-	movdqu	-61(%rsi), %xmm0
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -61(%rdi)
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_53bytes):
-	movdqu	-53(%rsi), %xmm0
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -53(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_45bytes):
-	mov	-45(%rsi), %r8
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -45(%rdi)
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_37bytes):
-	mov	-37(%rsi), %r9
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -37(%rdi)
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_29bytes):
-	mov	-29(%rsi), %r10
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -29(%rdi)
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_21bytes):
-	mov	-21(%rsi), %r11
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -21(%rdi)
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_13bytes):
-	mov	-13(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -13(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_5bytes):
-	mov	-5(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -5(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_78bytes):
-	movdqu	-78(%rsi), %xmm0
-	movdqu	-62(%rsi), %xmm1
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -78(%rdi)
-	movdqu	 %xmm1, -62(%rdi)
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_70bytes):
-	movdqu	-70(%rsi), %xmm0
-	movdqu	-54(%rsi), %xmm1
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -70(%rdi)
-	movdqu	 %xmm1, -54(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_62bytes):
-	movdqu	-62(%rsi), %xmm0
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -62(%rdi)
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_54bytes):
-	movdqu	-54(%rsi), %xmm0
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -54(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_46bytes):
-	mov	-46(%rsi), %r8
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -46(%rdi)
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_38bytes):
-	mov	-38(%rsi), %r9
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -38(%rdi)
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_30bytes):
-	mov	-30(%rsi), %r10
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -30(%rdi)
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_22bytes):
-	mov	-22(%rsi), %r11
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -22(%rdi)
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_14bytes):
-	mov	-14(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -14(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_6bytes):
-	mov	-6(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -6(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(write_79bytes):
-	movdqu	-79(%rsi), %xmm0
-	movdqu	-63(%rsi), %xmm1
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -79(%rdi)
-	movdqu	 %xmm1, -63(%rdi)
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_71bytes):
-	movdqu	-71(%rsi), %xmm0
-	movdqu	-55(%rsi), %xmm1
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -71(%rdi)
-	movdqu	 %xmm1, -55(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_63bytes):
-	movdqu	-63(%rsi), %xmm0
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -63(%rdi)
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_55bytes):
-	movdqu	-55(%rsi), %xmm0
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	movdqu	 %xmm0, -55(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_47bytes):
-	mov	-47(%rsi), %r8
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r8, -47(%rdi)
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_39bytes):
-	mov	-39(%rsi), %r9
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r9, -39(%rdi)
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_31bytes):
-	mov	-31(%rsi), %r10
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r10, -31(%rdi)
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_23bytes):
-	mov	-23(%rsi), %r11
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %r11, -23(%rdi)
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_15bytes):
-	mov	-15(%rsi), %rcx
-	mov	-8(%rsi), %rdx
-	mov	 %rcx, -15(%rdi)
-	mov	 %rdx, -8(%rdi)
-	ret
-
-	.p2align 4
-L(write_7bytes):
-	mov	-7(%rsi), %edx
-	mov	-4(%rsi), %ecx
-	mov	 %edx, -7(%rdi)
-	mov	 %ecx, -4(%rdi)
-	ret
-
-	.p2align 4
-L(large_page_fwd):
-	movdqu	(%rsi), %xmm1
-	lea	16(%rsi), %rsi
-	movdqu	%xmm0, (%r8)
-	movntdq	%xmm1, (%rdi)
-	lea	16(%rdi), %rdi
-	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
-	mov	%rsi, %r9
-	sub	%rdi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_fwd)
-	shl	$2, %rcx
-	cmp	%rcx, %rdx
-	jb	L(ll_cache_copy_fwd_start)
-L(memmove_is_memcpy_fwd):
-#endif
-L(large_page_loop):
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movntdq	%xmm0, (%rdi)
-	movntdq	%xmm1, 0x10(%rdi)
-	movntdq	%xmm2, 0x20(%rdi)
-	movntdq	%xmm3, 0x30(%rdi)
-	movntdq	%xmm4, 0x40(%rdi)
-	movntdq	%xmm5, 0x50(%rdi)
-	movntdq	%xmm6, 0x60(%rdi)
-	movntdq	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(large_page_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_less_64bytes)
-
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	lea	0x40(%rsi), %rsi
-
-	movntdq	%xmm0, (%rdi)
-	movntdq	%xmm1, 0x10(%rdi)
-	movntdq	%xmm2, 0x20(%rdi)
-	movntdq	%xmm3, 0x30(%rdi)
-	lea	0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_less_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(ll_cache_copy_fwd_start):
-	prefetcht0 0x1c0(%rsi)
-	prefetcht0 0x200(%rsi)
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	movdqu	0x40(%rsi), %xmm4
-	movdqu	0x50(%rsi), %xmm5
-	movdqu	0x60(%rsi), %xmm6
-	movdqu	0x70(%rsi), %xmm7
-	lea	0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movaps	%xmm0, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-	movaps	%xmm4, 0x40(%rdi)
-	movaps	%xmm5, 0x50(%rdi)
-	movaps	%xmm6, 0x60(%rdi)
-	movaps	%xmm7, 0x70(%rdi)
-	lea	0x80(%rdi), %rdi
-	jae	L(ll_cache_copy_fwd_start)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_ll_less_fwd_64bytes)
-
-	movdqu	(%rsi), %xmm0
-	movdqu	0x10(%rsi), %xmm1
-	movdqu	0x20(%rsi), %xmm2
-	movdqu	0x30(%rsi), %xmm3
-	lea	0x40(%rsi), %rsi
-
-	movaps	%xmm0, (%rdi)
-	movaps	%xmm1, 0x10(%rdi)
-	movaps	%xmm2, 0x20(%rdi)
-	movaps	%xmm3, 0x30(%rdi)
-	lea	0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_ll_less_fwd_64bytes):
-	add	%rdx, %rsi
-	add	%rdx, %rdi
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#endif
-	.p2align 4
-L(large_page_bwd):
-	movdqu	-0x10(%rsi), %xmm1
-	lea	-16(%rsi), %rsi
-	movdqu	%xmm0, (%r8)
-	movdqa	%xmm1, -0x10(%rdi)
-	lea	-16(%rdi), %rdi
-	lea	-0x90(%rdx), %rdx
-#ifdef USE_AS_MEMMOVE
-	mov	%rdi, %r9
-	sub	%rsi, %r9
-	cmp	%rdx, %r9
-	jae	L(memmove_is_memcpy_bwd)
-	cmp	%rcx, %r9
-	jb	L(ll_cache_copy_bwd_start)
-L(memmove_is_memcpy_bwd):
-#endif
-L(large_page_bwd_loop):
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	movdqu	-0x50(%rsi), %xmm4
-	movdqu	-0x60(%rsi), %xmm5
-	movdqu	-0x70(%rsi), %xmm6
-	movdqu	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movntdq	%xmm0, -0x10(%rdi)
-	movntdq	%xmm1, -0x20(%rdi)
-	movntdq	%xmm2, -0x30(%rdi)
-	movntdq	%xmm3, -0x40(%rdi)
-	movntdq	%xmm4, -0x50(%rdi)
-	movntdq	%xmm5, -0x60(%rdi)
-	movntdq	%xmm6, -0x70(%rdi)
-	movntdq	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	jae	L(large_page_bwd_loop)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_less_bwd_64bytes)
-
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	lea	-0x40(%rsi), %rsi
-
-	movntdq	%xmm0, -0x10(%rdi)
-	movntdq	%xmm1, -0x20(%rdi)
-	movntdq	%xmm2, -0x30(%rdi)
-	movntdq	%xmm3, -0x40(%rdi)
-	lea	-0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_less_bwd_64bytes):
-	sfence
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-
-#ifdef USE_AS_MEMMOVE
-	.p2align 4
-L(ll_cache_copy_bwd_start):
-	prefetcht0 -0x1c0(%rsi)
-	prefetcht0 -0x200(%rsi)
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	movdqu	-0x50(%rsi), %xmm4
-	movdqu	-0x60(%rsi), %xmm5
-	movdqu	-0x70(%rsi), %xmm6
-	movdqu	-0x80(%rsi), %xmm7
-	lea	-0x80(%rsi), %rsi
-
-	sub	$0x80, %rdx
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	movaps	%xmm4, -0x50(%rdi)
-	movaps	%xmm5, -0x60(%rdi)
-	movaps	%xmm6, -0x70(%rdi)
-	movaps	%xmm7, -0x80(%rdi)
-	lea	-0x80(%rdi), %rdi
-	jae	L(ll_cache_copy_bwd_start)
-	cmp	$-0x40, %rdx
-	lea	0x80(%rdx), %rdx
-	jl	L(large_page_ll_less_bwd_64bytes)
-
-	movdqu	-0x10(%rsi), %xmm0
-	movdqu	-0x20(%rsi), %xmm1
-	movdqu	-0x30(%rsi), %xmm2
-	movdqu	-0x40(%rsi), %xmm3
-	lea	-0x40(%rsi), %rsi
-
-	movaps	%xmm0, -0x10(%rdi)
-	movaps	%xmm1, -0x20(%rdi)
-	movaps	%xmm2, -0x30(%rdi)
-	movaps	%xmm3, -0x40(%rdi)
-	lea	-0x40(%rdi), %rdi
-	sub	$0x40, %rdx
-L(large_page_ll_less_bwd_64bytes):
-	BRANCH_TO_JMPTBL_ENTRY (L(table_less_80bytes), %rdx, 4)
-#endif
-
-END (MEMCPY)
-
-	.section .rodata.ssse3,"a",@progbits
-	.p2align 3
-L(table_less_80bytes):
-	.int	JMPTBL (L(write_0bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_1bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_2bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_3bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_4bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_5bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_6bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_7bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_8bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_9bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_10bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_11bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_12bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_13bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_14bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_15bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_16bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_17bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_18bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_19bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_20bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_21bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_22bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_23bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_24bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_25bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_26bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_27bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_28bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_29bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_30bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_31bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_32bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_33bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_34bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_35bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_36bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_37bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_38bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_39bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_40bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_41bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_42bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_43bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_44bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_45bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_46bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_47bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_48bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_49bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_50bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_51bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_52bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_53bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_54bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_55bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_56bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_57bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_58bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_59bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_60bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_61bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_62bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_63bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_64bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_65bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_66bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_67bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_68bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_69bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_70bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_71bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_72bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_73bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_74bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_75bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_76bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_77bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_78bytes), L(table_less_80bytes))
-	.int	JMPTBL (L(write_79bytes), L(table_less_80bytes))
-
-	.p2align 3
-L(shl_table):
-	.int	JMPTBL (L(shl_0), L(shl_table))
-	.int	JMPTBL (L(shl_1), L(shl_table))
-	.int	JMPTBL (L(shl_2), L(shl_table))
-	.int	JMPTBL (L(shl_3), L(shl_table))
-	.int	JMPTBL (L(shl_4), L(shl_table))
-	.int	JMPTBL (L(shl_5), L(shl_table))
-	.int	JMPTBL (L(shl_6), L(shl_table))
-	.int	JMPTBL (L(shl_7), L(shl_table))
-	.int	JMPTBL (L(shl_8), L(shl_table))
-	.int	JMPTBL (L(shl_9), L(shl_table))
-	.int	JMPTBL (L(shl_10), L(shl_table))
-	.int	JMPTBL (L(shl_11), L(shl_table))
-	.int	JMPTBL (L(shl_12), L(shl_table))
-	.int	JMPTBL (L(shl_13), L(shl_table))
-	.int	JMPTBL (L(shl_14), L(shl_table))
-	.int	JMPTBL (L(shl_15), L(shl_table))
-
-	.p2align 3
-L(shl_table_bwd):
-	.int	JMPTBL (L(shl_0_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_1_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_2_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_3_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_4_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_5_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_6_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_7_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_8_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_9_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_10_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_11_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_12_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_13_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_14_bwd), L(shl_table_bwd))
-	.int	JMPTBL (L(shl_15_bwd), L(shl_table_bwd))
-
-#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-ssse3.S b/sysdeps/x86_64/multiarch/memmove-ssse3.S
index 295430b1ef..215583e7bd 100644
--- a/sysdeps/x86_64/multiarch/memmove-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memmove-ssse3.S
@@ -1,4 +1,380 @@
-#define USE_AS_MEMMOVE
-#define MEMCPY		__memmove_ssse3
-#define MEMCPY_CHK	__memmove_chk_ssse3
-#include "memcpy-ssse3.S"
+#include <sysdep.h>
+
+#ifndef MEMMOVE
+# define MEMMOVE	__memmove_ssse3
+# define MEMMOVE_CHK	__memmove_chk_ssse3
+# define MEMCPY	__memcpy_ssse3
+# define MEMCPY_CHK	__memcpy_chk_ssse3
+# define MEMPCPY	__mempcpy_ssse3
+# define MEMPCPY_CHK	__mempcpy_chk_ssse3
+#endif
+
+	.section .text.ssse3, "ax", @progbits
+ENTRY(MEMPCPY_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMPCPY_CHK)
+
+ENTRY(MEMPCPY)
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+	jmp	L(start)
+END(MEMPCPY)
+
+ENTRY(MEMMOVE_CHK)
+	cmp	%RDX_LP, %RCX_LP
+	jb	HIDDEN_JUMPTARGET(__chk_fail)
+END(MEMMOVE_CHK)
+
+ENTRY_P2ALIGN(MEMMOVE, 6)
+	movq	%rdi, %rax
+L(start):
+	cmpq	$16, %rdx
+	jb	L(copy_0_15)
+
+	/* These loads are always useful.  */
+	movups	0(%rsi), %xmm0
+	movups	-16(%rsi, %rdx), %xmm7
+	cmpq	$32, %rdx
+	ja	L(more_2x_vec)
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm7, -16(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_15):
+	cmpl	$4, %edx
+	jb	L(copy_0_3)
+	cmpl	$8, %edx
+	jb	L(copy_4_7)
+	movq	0(%rsi), %rcx
+	movq	-8(%rsi, %rdx), %rsi
+	movq	%rcx, 0(%rdi)
+	movq	%rsi, -8(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_4_7):
+	movl	0(%rsi), %ecx
+	movl	-4(%rsi, %rdx), %esi
+	movl	%ecx, 0(%rdi)
+	movl	%esi, -4(%rdi, %rdx)
+	ret
+
+	.p2align 4,, 4
+L(copy_0_3):
+	decl	%edx
+	jl	L(copy_0_0)
+	movb	(%rsi), %cl
+	je	L(copy_1_1)
+
+	movzwl	-1(%rsi, %rdx), %esi
+	movw	%si, -1(%rdi, %rdx)
+L(copy_1_1):
+	movb	%cl, (%rdi)
+L(copy_0_0):
+	ret
+
+	.p2align 4,, 4
+L(copy_4x_vec):
+	movups	16(%rsi), %xmm1
+	movups	-32(%rsi, %rdx), %xmm2
+
+	movups	%xmm0, 0(%rdi)
+	movups	%xmm1, 16(%rdi)
+	movups	%xmm2, -32(%rdi, %rdx)
+	movups	%xmm7, -16(%rdi, %rdx)
+L(nop):
+	ret
+
+	.p2align 4
+L(more_2x_vec):
+	cmpq	$64, %rdx
+	jbe	L(copy_4x_vec)
+
+	/* We use rcx later to get alignr value.  */
+	movq	%rdi, %rcx
+
+	/* Backward copy for overlap + dst > src for memmove safety.  */
+	subq	%rsi, %rcx
+	cmpq	%rdx, %rcx
+	jb	L(copy_backward)
+
+	/* Load tail.  */
+
+	/* -16(%rsi, %rdx) already loaded into xmm7.  */
+	movups	-32(%rsi, %rdx), %xmm8
+	movups	-48(%rsi, %rdx), %xmm9
+
+	/* Get misalignment.  */
+	andl	$0xf, %ecx
+
+	movq	%rsi, %r9
+	addq	%rcx, %rsi
+	andq	$-16, %rsi
+	/* Get first vec for `palignr`.  */
+	movaps	(%rsi), %xmm1
+
+	/* We have loaded (%rsi) so safe to do this store before the
+	   loop.  */
+	movups	%xmm0, (%rdi)
+
+#ifdef SHARED_CACHE_SIZE_HALF
+	cmp	$SHARED_CACHE_SIZE_HALF, %RDX_LP
+#else
+	cmp	__x86_shared_cache_size_half(%rip), %rdx
+#endif
+	ja	L(large_memcpy)
+
+	leaq	-64(%rdi, %rdx), %r8
+	andq	$-16, %rdi
+	movl	$48, %edx
+
+	leaq	L(loop_fwd_start)(%rip), %r9
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(copy_backward):
+	testq	%rcx, %rcx
+	jz	L(nop)
+
+	/* Preload tail.  */
+
+	/* (%rsi) already loaded into xmm0.  */
+	movups	16(%rsi), %xmm4
+	movups	32(%rsi), %xmm5
+
+	movq	%rdi, %r8
+	subq	%rdi, %rsi
+	leaq	-49(%rdi, %rdx), %rdi
+	andq	$-16, %rdi
+	addq	%rdi, %rsi
+	andq	$-16, %rsi
+
+	movaps	48(%rsi), %xmm6
+
+
+	leaq	L(loop_bkwd_start)(%rip), %r9
+	andl	$0xf, %ecx
+	sall	$6, %ecx
+	addq	%r9, %rcx
+	jmp	* %rcx
+
+	.p2align 4,, 8
+L(large_memcpy):
+	movups	-64(%r9, %rdx), %xmm10
+	movups	-80(%r9, %rdx), %xmm11
+
+	sall	$5, %ecx
+	leal	(%rcx, %rcx, 2), %r8d
+	leaq	-96(%rdi, %rdx), %rcx
+	andq	$-16, %rdi
+	leaq	L(large_loop_fwd_start)(%rip), %rdx
+	addq	%r8, %rdx
+	jmp	* %rdx
+
+
+	/* Instead of a typical jump table, all 16 loops are exactly
+	   64 bytes in size, so we can just jump to the first loop +
+	   r8 * 64.  Before modifying any loop ensure all their sizes
+	   match!  */
+	.p2align 6
+L(loop_fwd_start):
+L(loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	%xmm1, 16(%rdi)
+	movaps	%xmm2, 32(%rdi)
+	movaps	%xmm3, 48(%rdi)
+	addq	%rdx, %rdi
+	addq	%rdx, %rsi
+	cmpq	%rdi, %r8
+	ja	L(loop_fwd_0x0)
+L(end_loop_fwd):
+	movups	%xmm9, 16(%r8)
+	movups	%xmm8, 32(%r8)
+	movups	%xmm7, 48(%r8)
+	ret
+
+	/* Exactly 64 bytes if `jmp L(end_loop_fwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_FWD(align_by);	\
+	.p2align 6;	\
+L(loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	%xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm4, %xmm1;	\
+	movaps	%xmm0, 16(%rdi);	\
+	movaps	%xmm2, 32(%rdi);	\
+	movaps	%xmm3, 48(%rdi);	\
+	addq	%rdx, %rdi;	\
+	addq	%rdx, %rsi;	\
+	cmpq	%rdi, %r8;	\
+	ja	L(loop_fwd_ ## align_by);	\
+	jmp	L(end_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_FWD (0xf)
+	ALIGNED_LOOP_FWD (0xe)
+	ALIGNED_LOOP_FWD (0xd)
+	ALIGNED_LOOP_FWD (0xc)
+	ALIGNED_LOOP_FWD (0xb)
+	ALIGNED_LOOP_FWD (0xa)
+	ALIGNED_LOOP_FWD (0x9)
+	ALIGNED_LOOP_FWD (0x8)
+	ALIGNED_LOOP_FWD (0x7)
+	ALIGNED_LOOP_FWD (0x6)
+	ALIGNED_LOOP_FWD (0x5)
+	ALIGNED_LOOP_FWD (0x4)
+	ALIGNED_LOOP_FWD (0x3)
+	ALIGNED_LOOP_FWD (0x2)
+	ALIGNED_LOOP_FWD (0x1)
+
+	.p2align 6
+L(large_loop_fwd_start):
+L(large_loop_fwd_0x0):
+	movaps	16(%rsi), %xmm1
+	movaps	32(%rsi), %xmm2
+	movaps	48(%rsi), %xmm3
+	movaps	64(%rsi), %xmm4
+	movaps	80(%rsi), %xmm5
+	movntps	%xmm1, 16(%rdi)
+	movntps	%xmm2, 32(%rdi)
+	movntps	%xmm3, 48(%rdi)
+	movntps	%xmm4, 64(%rdi)
+	movntps	%xmm5, 80(%rdi)
+	addq	$80, %rdi
+	addq	$80, %rsi
+	cmpq	%rdi, %rcx
+	ja	L(large_loop_fwd_0x0)
+
+	/* Ensure no icache line split on tail.  */
+	.p2align 4
+L(end_large_loop_fwd):
+	sfence
+	movups	%xmm11, 16(%rcx)
+	movups	%xmm10, 32(%rcx)
+	movups	%xmm9, 48(%rcx)
+	movups	%xmm8, 64(%rcx)
+	movups	%xmm7, 80(%rcx)
+	ret
+
+
+	/* Each loop body is > 64 bytes and <= 96 bytes.  The 32-byte
+	   alignment ensures 96-byte spacing between entry points.  */
+#define ALIGNED_LARGE_LOOP_FWD(align_by);	\
+	.p2align 5;	\
+L(large_loop_fwd_ ## align_by):	\
+	movaps	16(%rsi), %xmm0;	\
+	movaps	32(%rsi), %xmm2;	\
+	movaps	48(%rsi), %xmm3;	\
+	movaps	64(%rsi), %xmm4;	\
+	movaps	80(%rsi), %xmm5;	\
+	movaps	%xmm5, %xmm6;	\
+	palignr	$align_by, %xmm4, %xmm5;	\
+	palignr	$align_by, %xmm3, %xmm4;	\
+	palignr	$align_by, %xmm2, %xmm3;	\
+	palignr	$align_by, %xmm0, %xmm2;	\
+	palignr	$align_by, %xmm1, %xmm0;	\
+	movaps	%xmm6, %xmm1;	\
+	movntps	%xmm0, 16(%rdi);	\
+	movntps	%xmm2, 32(%rdi);	\
+	movntps	%xmm3, 48(%rdi);	\
+	movntps	%xmm4, 64(%rdi);	\
+	movntps	%xmm5, 80(%rdi);	\
+	addq	$80, %rdi;	\
+	addq	$80, %rsi;	\
+	cmpq	%rdi, %rcx;	\
+	ja	L(large_loop_fwd_ ## align_by);	\
+	jmp	L(end_large_loop_fwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LARGE_LOOP_FWD (0xf)
+	ALIGNED_LARGE_LOOP_FWD (0xe)
+	ALIGNED_LARGE_LOOP_FWD (0xd)
+	ALIGNED_LARGE_LOOP_FWD (0xc)
+	ALIGNED_LARGE_LOOP_FWD (0xb)
+	ALIGNED_LARGE_LOOP_FWD (0xa)
+	ALIGNED_LARGE_LOOP_FWD (0x9)
+	ALIGNED_LARGE_LOOP_FWD (0x8)
+	ALIGNED_LARGE_LOOP_FWD (0x7)
+	ALIGNED_LARGE_LOOP_FWD (0x6)
+	ALIGNED_LARGE_LOOP_FWD (0x5)
+	ALIGNED_LARGE_LOOP_FWD (0x4)
+	ALIGNED_LARGE_LOOP_FWD (0x3)
+	ALIGNED_LARGE_LOOP_FWD (0x2)
+	ALIGNED_LARGE_LOOP_FWD (0x1)
+
+
+	.p2align 6
+L(loop_bkwd_start):
+L(loop_bkwd_0x0):
+	movaps	32(%rsi), %xmm1
+	movaps	16(%rsi), %xmm2
+	movaps	0(%rsi), %xmm3
+	movaps	%xmm1, 32(%rdi)
+	movaps	%xmm2, 16(%rdi)
+	movaps	%xmm3, 0(%rdi)
+	subq	$48, %rdi
+	subq	$48, %rsi
+	cmpq	%rdi, %r8
+	jb	L(loop_bkwd_0x0)
+L(end_loop_bkwd):
+	movups	%xmm7, -16(%r8, %rdx)
+	movups	%xmm0, 0(%r8)
+	movups	%xmm4, 16(%r8)
+	movups	%xmm5, 32(%r8)
+
+	ret
+
+
+	/* Exactly 64 bytes if `jmp L(end_loop_bkwd)` is long encoding.
+	   60 bytes otherwise.  */
+#define ALIGNED_LOOP_BKWD(align_by);	\
+	.p2align 6;	\
+L(loop_bkwd_ ## align_by):	\
+	movaps	32(%rsi), %xmm1;	\
+	movaps	16(%rsi), %xmm2;	\
+	movaps	0(%rsi), %xmm3;	\
+	palignr	$align_by, %xmm1, %xmm6;	\
+	palignr	$align_by, %xmm2, %xmm1;	\
+	palignr	$align_by, %xmm3, %xmm2;	\
+	movaps	%xmm6, 32(%rdi);	\
+	movaps	%xmm1, 16(%rdi);	\
+	movaps	%xmm2, 0(%rdi);	\
+	subq	$48, %rdi;	\
+	subq	$48, %rsi;	\
+	movaps	%xmm3, %xmm6;	\
+	cmpq	%rdi, %r8;	\
+	jb	L(loop_bkwd_ ## align_by);	\
+	jmp	L(end_loop_bkwd);
+
+	/* Must be in descending order.  */
+	ALIGNED_LOOP_BKWD (0xf)
+	ALIGNED_LOOP_BKWD (0xe)
+	ALIGNED_LOOP_BKWD (0xd)
+	ALIGNED_LOOP_BKWD (0xc)
+	ALIGNED_LOOP_BKWD (0xb)
+	ALIGNED_LOOP_BKWD (0xa)
+	ALIGNED_LOOP_BKWD (0x9)
+	ALIGNED_LOOP_BKWD (0x8)
+	ALIGNED_LOOP_BKWD (0x7)
+	ALIGNED_LOOP_BKWD (0x6)
+	ALIGNED_LOOP_BKWD (0x5)
+	ALIGNED_LOOP_BKWD (0x4)
+	ALIGNED_LOOP_BKWD (0x3)
+	ALIGNED_LOOP_BKWD (0x2)
+	ALIGNED_LOOP_BKWD (0x1)
+END(MEMMOVE)
+
+strong_alias (MEMMOVE, MEMCPY)
+strong_alias (MEMMOVE_CHK, MEMCPY_CHK)
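
As a side note on how the new __memmove_ssse3 above dispatches, here is a
minimal C sketch (hypothetical names; not part of the commit).  It shows the
two decisions the assembly makes up front: a single unsigned comparison of
dst - src against the length decides whether a forward copy is safe despite
overlap, and the low 4 bits of dst - src select one of 16 fixed-size
`palignr` loop variants in place of a jump table.

    #include <stddef.h>
    #include <stdint.h>

    /* Stand-ins for the 16 `palignr` loop variants; variant 0 is the
       mutually aligned case.  A byte loop is used here only to keep the
       sketch self-contained.  */
    static void
    copy_fwd_variant (char *dst, const char *src, size_t len, unsigned variant)
    {
      (void) variant;
      for (size_t i = 0; i < len; i++)
        dst[i] = src[i];
    }

    static void
    copy_bwd_variant (char *dst, const char *src, size_t len, unsigned variant)
    {
      (void) variant;
      for (size_t i = len; i-- > 0;)
        dst[i] = src[i];
    }

    void *
    memmove_sketch (void *dstv, const void *srcv, size_t len)
    {
      char *dst = dstv;
      const char *src = srcv;
      /* dst - src as an unsigned value.  When dst precedes src this wraps
         to a huge number, so the forward copy is chosen, which is safe in
         that case anyway.  */
      uintptr_t diff = (uintptr_t) dst - (uintptr_t) src;
      /* The relative misalignment picks one of 16 loops, mirroring
         `sall $6, %ecx; addq %r9, %rcx; jmp * %rcx` above, where every
         loop body is padded to exactly 64 bytes.  */
      unsigned variant = (unsigned) diff & 0xf;

      if (diff >= len)
        copy_fwd_variant (dst, src, len, variant);  /* No harmful overlap.  */
      else
        copy_bwd_variant (dst, src, len, variant);  /* dst lies inside (src, src + len).  */
      return dstv;
    }

In the real code the backward path additionally early-exits when dst == src
(the `testq %rcx, %rcx; jz L(nop)` above), and the variants are reached by an
indirect jump rather than a switch because their uniform 64-byte size makes
the target address a simple multiply-add.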


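Similarly, the L(large_memcpy) path above compares the copy size against
__x86_shared_cache_size_half and, above that threshold, switches to movntps
non-temporal stores finished with an sfence.  A rough intrinsics sketch of
that idea follows (the fixed threshold and function names are illustrative
only, and it uses the integer-domain _mm_stream_si128 where the assembly uses
movntps):

    #include <emmintrin.h>	/* SSE2 intrinsics.  */
    #include <stddef.h>
    #include <string.h>

    /* Illustrative fixed threshold; the assembly instead reads
       __x86_shared_cache_size_half at run time.  */
    #define NT_THRESHOLD (1024 * 1024)

    static void
    copy_nontemporal (void *dst, const void *src, size_t len)
    {
      /* Assumes dst is 16-byte aligned and len is a multiple of 16, as
         the assembly arranges before entering its non-temporal loop.  */
      __m128i *d = dst;
      const __m128i *s = src;
      for (size_t i = 0; i < len / 16; i++)
        _mm_stream_si128 (d + i, _mm_loadu_si128 (s + i));
      /* Streaming stores are weakly ordered; fence before returning, as
         the assembly does with `sfence`.  */
      _mm_sfence ();
    }

    static void
    copy_dispatch (void *dst, const void *src, size_t len)
    {
      if (len > NT_THRESHOLD)
        copy_nontemporal (dst, src, len);	/* Do not pollute the cache.  */
      else
        memcpy (dst, src, len);
    }

The real loops above additionally interleave palignr so that aligned movaps
loads can feed the streaming stores when source and destination are mutually
misaligned.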